2 # -*- coding: utf-8 -*-
15 import xml.etree.ElementTree
18 from urlparse import parse_qs
21 import cStringIO as StringIO
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader, unescaped.
        upload_date:    Video upload date (YYYYMMDD).
        title:          Video title, unescaped.
        ext:            Video filename extension.

    The following fields are optional:

        format:         The video format, defaults to ext (used for --get-format)
        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.
        player_url:     SWF Player URL (used for rtmpdump).
        subtitles:      The .srt file contents.
        urlhandle:      [internal] The urlHandle to be used to download the file,
                        like returned by urllib2.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # becomes True after _real_initialize() has run once
    _downloader = None      # FileDownloader instance, set via set_downloader()
    _WORKING = True         # subclasses set False for known-broken extractors

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the subclass hook only once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
112 class YoutubeIE(InfoExtractor):
113 """Information extractor for youtube.com."""
# NOTE(review): this listing is corrupted -- every line carries a stray
# original-line-number prefix, indentation is lost, and many lines are
# missing (e.g. the `_VALID_URL = r"""` opener before the verbose regex
# below, several `try:` lines, and `return` statements). Comments below
# describe only what the visible fragments show.
# Verbose-mode regex matching the many YouTube URL shapes (youtu.be/,
# watch?v=, embed/, naked IDs, ...); the video ID is captured by the
# `([0-9A-Za-z_-]+)` group near the end.
117 (?:https?://)? # http(s):// (optional)
118 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
119 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
120 (?:.*?\#/)? # handle anchor (#/) redirect urls
121 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
122 (?: # the various things that can precede the ID:
123 (?:(?:v|embed|e)/) # v/ or embed/ or e/
124 |(?: # or the v= param in all its forms
125 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
126 (?:\?|\#!?) # the params delimiter ? or # or #!
127 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
130 )? # optional -> youtube.com/xxxx is OK
131 )? # all until now is optional -> you can pass the naked ID
132 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
133 (?(1).+)? # if we found the ID, everything can follow
# Helper URLs for forcing the English site, logging in and age verification.
135 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
136 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
137 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
138 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
139 _NETRC_MACHINE = 'youtube'
140 # Listed in order of quality
141 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
142 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file-extension map (most entries elided in this listing).
143 _video_extensions = {
149 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display-string map (entries elided in this listing).
155 _video_dimensions = {
# Override of the base method: _VALID_URL here is written in verbose
# mode, so re.VERBOSE must be passed.
173 def suitable(self, url):
174 """Receives a URL and returns True if suitable for this IE."""
175 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
177 def report_lang(self):
178 """Report attempt to set language."""
179 self._downloader.to_screen(u'[youtube] Setting language')
181 def report_login(self):
182 """Report attempt to log in."""
183 self._downloader.to_screen(u'[youtube] Logging in')
185 def report_age_confirmation(self):
186 """Report attempt to confirm age."""
187 self._downloader.to_screen(u'[youtube] Confirming age')
189 def report_video_webpage_download(self, video_id):
190 """Report attempt to download video webpage."""
191 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
193 def report_video_info_webpage_download(self, video_id):
194 """Report attempt to download video info webpage."""
195 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
197 def report_video_subtitles_download(self, video_id):
198 """Report attempt to download video subtitles."""
199 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
201 def report_information_extraction(self, video_id):
202 """Report attempt to extract video information."""
203 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
205 def report_unavailable_format(self, video_id, format):
206 """Report that the requested format is not available."""
207 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
209 def report_rtmp_download(self):
210 """Indicate the download will use the RTMP protocol."""
211 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Convert YouTube's closed-caption XML into SRT text. The `srt`
# accumulator initialisation, the float() conversion of `start`, and the
# final `return srt` appear to be among the lines missing from this
# listing -- TODO confirm against the full file.
213 def _closed_captions_xml_to_srt(self, xml_string):
215 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
216 # TODO parse xml instead of regex
217 for n, (start, dur_tag, dur, caption) in enumerate(texts):
# A missing duration defaults to 4 seconds.
218 if not dur: dur = '4'
220 end = start + float(dur)
# Render SRT timestamps as HH:MM:SS,mmm.
221 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
222 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
223 caption = unescapeHTML(caption)
224 caption = unescapeHTML(caption) # double cycle, intentional
225 srt += str(n+1) + '\n'
226 srt += start + ' --> ' + end + '\n'
227 srt += caption + '\n\n'
# Print downloadable formats for --list-formats; the `for x in formats:`
# loop header is missing from this listing.
230 def _print_formats(self, formats):
231 print('Available formats:')
233 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Set site language, then authenticate (credentials from downloader
# params or ~/.netrc) and confirm age. Several `try:`/`else:` lines are
# missing from this listing.
235 def _real_initialize(self):
236 if self._downloader is None:
241 downloader_params = self._downloader.params
243 # Attempt to use provided username and password or .netrc data
244 if downloader_params.get('username', None) is not None:
245 username = downloader_params['username']
246 password = downloader_params['password']
247 elif downloader_params.get('usenetrc', False):
249 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
254 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
255 except (IOError, netrc.NetrcParseError), err:
256 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
# Force the English-language site so later page-scraping regexes match.
260 request = urllib2.Request(self._LANG_URL)
263 urllib2.urlopen(request).read()
264 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
265 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
268 # No authentication to be performed
# POST the login form. A failed login still returns HTTP 200, so a bad
# password is detected by the login form re-appearing in the response.
274 'current_form': 'loginForm',
276 'action_login': 'Log In',
277 'username': username,
278 'password': password,
280 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
283 login_results = urllib2.urlopen(request).read()
284 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
285 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
287 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
288 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Confirm age, required before age-restricted videos are served.
294 'action_confirm': 'Confirm',
296 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
298 self.report_age_confirmation()
299 age_results = urllib2.urlopen(request).read()
300 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
301 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
# Main extraction: resolve next_url redirects, fetch the watch page and
# get_video_info, pick the formats to download, and build one info dict
# per selected format. Many `try:`/`return` lines are missing from this
# listing.
304 def _real_extract(self, url):
305 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
306 mobj = re.search(self._NEXT_URL_RE, url)
308 url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
310 # Extract video id from URL
311 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
313 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
315 video_id = mobj.group(2)
318 self.report_video_webpage_download(video_id)
319 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
321 video_webpage = urllib2.urlopen(request).read()
322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
323 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
326 # Attempt to extract SWF player URL
327 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Undo the JavaScript backslash-escaping in the SWF URL.
329 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several &el= variants of get_video_info until one yields a token.
334 self.report_video_info_webpage_download(video_id)
335 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
336 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
337 % (video_id, el_type))
338 request = urllib2.Request(video_info_url)
340 video_info_webpage = urllib2.urlopen(request).read()
341 video_info = parse_qs(video_info_webpage)
342 if 'token' in video_info:
344 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
345 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
347 if 'token' not in video_info:
348 if 'reason' in video_info:
349 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
351 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
354 # Check for "rental" videos
355 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
356 self._downloader.trouble(u'ERROR: "rental" videos not supported')
359 # Start extracting information
360 self.report_information_extraction(video_id)
# uploader nickname
363 if 'author' not in video_info:
364 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
366 video_uploader = urllib.unquote_plus(video_info['author'][0])
# title
369 if 'title' not in video_info:
370 self._downloader.trouble(u'ERROR: unable to extract video title')
372 video_title = urllib.unquote_plus(video_info['title'][0])
373 video_title = video_title.decode('utf-8')
# thumbnail image (missing thumbnail is only a warning)
376 if 'thumbnail_url' not in video_info:
377 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
379 else: # don't panic if we can't find it
380 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page and normalised to YYYYMMDD
# by trying several date formats.
384 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
386 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
387 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
388 for expression in format_expressions:
390 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description, scraped from the page body
395 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
396 if video_description: video_description = clean_html(video_description)
397 else: video_description = ''
# closed captions, converted to .srt when subtitles were requested;
# any failure is downgraded to a warning via the Trouble handler below
400 video_subtitles = None
401 if self._downloader.params.get('writesubtitles', False):
403 self.report_video_subtitles_download(video_id)
404 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
406 srt_list = urllib2.urlopen(request).read()
407 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
408 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
409 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
410 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
411 if not srt_lang_list:
412 raise Trouble(u'WARNING: video has no closed captions')
# Language priority: explicit --sub-lang, then English, then whatever
# language happens to be first in the dict.
413 if self._downloader.params.get('subtitleslang', False):
414 srt_lang = self._downloader.params.get('subtitleslang')
415 elif 'en' in srt_lang_list:
418 srt_lang = srt_lang_list.keys()[0]
419 if not srt_lang in srt_lang_list:
420 raise Trouble(u'WARNING: no closed captions found in the specified language')
421 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
423 srt_xml = urllib2.urlopen(request).read()
424 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
425 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
427 raise Trouble(u'WARNING: unable to download video subtitles')
428 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
429 except Trouble as trouble:
430 self._downloader.trouble(trouble[0])
# duration, in seconds as reported by get_video_info
432 if 'length_seconds' not in video_info:
433 self._downloader.trouble(u'WARNING: unable to extract video duration')
436 video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
# token
439 video_token = urllib.unquote_plus(video_info['token'][0])
441 # Decide which formats to download
442 req_format = self._downloader.params.get('format', None)
444 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
445 self.report_rtmp_download()
446 video_url_list = [(None, video_info['conn'][0])]
447 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
448 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
449 url_data = [parse_qs(uds) for uds in url_data_strs]
450 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
451 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
# Apply --max-quality by truncating the ordered format list.
453 format_limit = self._downloader.params.get('format_limit', None)
454 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
455 if format_limit is not None and format_limit in available_formats:
456 format_list = available_formats[available_formats.index(format_limit):]
458 format_list = available_formats
459 existing_formats = [x for x in format_list if x in url_map]
460 if len(existing_formats) == 0:
461 self._downloader.trouble(u'ERROR: no known formats available for video')
463 if self._downloader.params.get('listformats', None):
464 self._print_formats(existing_formats)
466 if req_format is None or req_format == 'best':
467 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
468 elif req_format == 'worst':
469 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
470 elif req_format in ('-1', 'all'):
471 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
473 # Specific formats. We pick the first in a slash-delimeted sequence.
474 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
475 req_formats = req_format.split('/')
476 video_url_list = None
477 for rf in req_formats:
479 video_url_list = [(rf, url_map[rf])]
481 if video_url_list is None:
482 self._downloader.trouble(u'ERROR: requested format not available')
485 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected format; the surrounding
# `results = []` / `return results` lines are missing from this listing.
489 for format_param, video_real_url in video_url_list:
491 video_extension = self._video_extensions.get(format_param, 'flv')
493 video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'),
494 self._video_dimensions.get(format_param, '???'))
497 'id': video_id.decode('utf-8'),
498 'url': video_real_url.decode('utf-8'),
499 'uploader': video_uploader.decode('utf-8'),
500 'upload_date': upload_date,
501 'title': video_title,
502 'ext': video_extension.decode('utf-8'),
503 'format': video_format,
504 'thumbnail': video_thumbnail.decode('utf-8'),
505 'description': video_description,
506 'player_url': player_url,
507 'subtitles': video_subtitles,
508 'duration': video_duration
513 class MetacafeIE(InfoExtractor):
514 """Information Extractor for metacafe.com."""
# NOTE(review): corrupted listing -- stray line-number prefixes, lost
# indentation and missing lines (several `try:`/`return` openers).
# Comments below describe only what the visible fragments show.
516 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
517 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
518 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
519 IE_NAME = u'metacafe'
521 def __init__(self, downloader=None):
522 InfoExtractor.__init__(self, downloader)
524 def report_disclaimer(self):
525 """Report disclaimer retrieval."""
526 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
528 def report_age_confirmation(self):
529 """Report attempt to confirm age."""
530 self._downloader.to_screen(u'[metacafe] Confirming age')
532 def report_download_webpage(self, video_id):
533 """Report webpage download."""
534 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
536 def report_extraction(self, video_id):
537 """Report information extraction."""
538 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the family-filter disclaimer page, then POST the confirmation
# form to disable filtering. The `try:` lines and part of the form dict
# are missing from this listing.
540 def _real_initialize(self):
541 # Retrieve disclaimer
542 request = urllib2.Request(self._DISCLAIMER)
544 self.report_disclaimer()
545 disclaimer = urllib2.urlopen(request).read()
546 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
547 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
# Confirm age by submitting the filter form.
553 'submit': "Continue - I'm over 18",
555 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
557 self.report_age_confirmation()
558 disclaimer = urllib2.urlopen(request).read()
559 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
560 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
563 def _real_extract(self, url):
564 # Extract id and simplified title from URL
565 mobj = re.match(self._VALID_URL, url)
567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
570 video_id = mobj.group(1)
572 # Check if video comes from YouTube
573 mobj2 = re.match(r'^yt-(.*)$', video_id)
574 if mobj2 is not None:
# Delegate yt-prefixed IDs to the YouTube extractor via the downloader.
575 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
578 # Retrieve video webpage to extract further information
579 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
581 self.report_download_webpage(video_id)
582 webpage = urllib2.urlopen(request).read()
583 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
584 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
587 # Extract URL, uploader and title from webpage
588 self.report_extraction(video_id)
589 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
591 mediaURL = urllib.unquote(mobj.group(1))
592 video_extension = mediaURL[-3:]
594 # Extract gdaKey if available
595 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
599 gdaKey = mobj.group(1)
600 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob and its mediaData JSON-ish
# payload for the media URL and key.
602 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
604 self._downloader.trouble(u'ERROR: unable to extract media URL')
606 vardict = parse_qs(mobj.group(1))
607 if 'mediaData' not in vardict:
608 self._downloader.trouble(u'ERROR: unable to extract media URL')
610 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
612 self._downloader.trouble(u'ERROR: unable to extract media URL')
614 mediaURL = mobj.group(1).replace('\\/', '/')
615 video_extension = mediaURL[-3:]
616 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
618 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
620 self._downloader.trouble(u'ERROR: unable to extract title')
622 video_title = mobj.group(1).decode('utf-8')
624 mobj = re.search(r'submitter=(.*?);', webpage)
626 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
628 video_uploader = mobj.group(1)
# Result dict; the opening `return [{` line is missing from this listing.
631 'id': video_id.decode('utf-8'),
632 'url': video_url.decode('utf-8'),
633 'uploader': video_uploader.decode('utf-8'),
634 'upload_date': u'NA',
635 'title': video_title,
636 'ext': video_extension.decode('utf-8'),
640 class DailymotionIE(InfoExtractor):
641 """Information Extractor for Dailymotion"""
# NOTE(review): corrupted listing -- stray line-number prefixes, lost
# indentation and missing lines (several `try:`/`return` openers).
# Comments below describe only what the visible fragments show.
643 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
644 IE_NAME = u'dailymotion'
646 def __init__(self, downloader=None):
647 InfoExtractor.__init__(self, downloader)
649 def report_download_webpage(self, video_id):
650 """Report webpage download."""
651 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
653 def report_extraction(self, video_id):
654 """Report information extraction."""
655 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
657 def _real_extract(self, url):
658 # Extract id and simplified title from URL
659 mobj = re.match(self._VALID_URL, url)
661 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Strip the title slug and any query suffix from the captured ID.
664 video_id = mobj.group(1).split('_')[0].split('?')[0]
666 video_extension = 'mp4'
668 # Retrieve video webpage to extract further information
669 request = urllib2.Request(url)
# Disable the family filter so restricted videos are served.
670 request.add_header('Cookie', 'family_filter=off')
672 self.report_download_webpage(video_id)
673 webpage = urllib2.urlopen(request).read()
674 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
675 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
678 # Extract URL, uploader and title from webpage
679 self.report_extraction(video_id)
680 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
682 self._downloader.trouble(u'ERROR: unable to extract media URL')
684 flashvars = urllib.unquote(mobj.group(1))
# Probe the qualities in this preference order; the first key present
# in flashvars wins (the assignment line is missing from this listing).
686 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
689 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
692 self._downloader.trouble(u'ERROR: unable to extract video URL')
695 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
697 self._downloader.trouble(u'ERROR: unable to extract video URL')
700 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
702 # TODO: support choosing qualities
704 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
706 self._downloader.trouble(u'ERROR: unable to extract title')
708 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
710 video_uploader = u'NA'
711 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
713 # looking for official user
714 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
715 if mobj_official is None:
716 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
718 video_uploader = mobj_official.group(1)
720 video_uploader = mobj.group(1)
# Upload date scraped as DD-MM-YYYY and rearranged into YYYYMMDD.
722 video_upload_date = u'NA'
723 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
725 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict; the opening `return [{` line is missing from this listing.
728 'id': video_id.decode('utf-8'),
729 'url': video_url.decode('utf-8'),
730 'uploader': video_uploader.decode('utf-8'),
731 'upload_date': video_upload_date,
732 'title': video_title,
733 'ext': video_extension.decode('utf-8'),
737 class GoogleIE(InfoExtractor):
738 """Information extractor for video.google.com."""
# NOTE(review): corrupted listing -- stray line-number prefixes, lost
# indentation and missing lines (several `try:`/`return` openers).
# Comments below describe only what the visible fragments show.
740 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
741 IE_NAME = u'video.google'
743 def __init__(self, downloader=None):
744 InfoExtractor.__init__(self, downloader)
746 def report_download_webpage(self, video_id):
747 """Report webpage download."""
748 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
750 def report_extraction(self, video_id):
751 """Report information extraction."""
752 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
754 def _real_extract(self, url):
755 # Extract id from URL
756 mobj = re.match(self._VALID_URL, url)
758 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
761 video_id = mobj.group(1)
763 video_extension = 'mp4'
765 # Retrieve video webpage to extract further information
766 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
768 self.report_download_webpage(video_id)
769 webpage = urllib2.urlopen(request).read()
770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
774 # Extract URL, uploader, and title from webpage
775 self.report_extraction(video_id)
776 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback: when no mp4 download_url is present, use the flv URL that
# is embedded in the page as \x-escaped JavaScript.
778 video_extension = 'flv'
779 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
781 self._downloader.trouble(u'ERROR: unable to extract media URL')
783 mediaURL = urllib.unquote(mobj.group(1))
# Decode the \x3d ('=') and \x26 ('&') escapes left in the URL.
784 mediaURL = mediaURL.replace('\\x3d', '\x3d')
785 mediaURL = mediaURL.replace('\\x26', '\x26')
789 mobj = re.search(r'<title>(.*)</title>', webpage)
791 self._downloader.trouble(u'ERROR: unable to extract title')
793 video_title = mobj.group(1).decode('utf-8')
795 # Extract video description
796 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
798 self._downloader.trouble(u'ERROR: unable to extract video description')
800 video_description = mobj.group(1).decode('utf-8')
801 if not video_description:
802 video_description = 'No description available.'
804 # Extract video thumbnail
805 if self._downloader.params.get('forcethumbnail', False):
# The thumbnail only appears on the search results page, so search for
# the video by its numeric ID.
806 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
808 webpage = urllib2.urlopen(request).read()
809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
812 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
814 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
816 video_thumbnail = mobj.group(1)
817 else: # we need something to pass to process_info
# Result dict; the opening `return [{` line is missing from this listing.
821 'id': video_id.decode('utf-8'),
822 'url': video_url.decode('utf-8'),
824 'upload_date': u'NA',
825 'title': video_title,
826 'ext': video_extension.decode('utf-8'),
830 class PhotobucketIE(InfoExtractor):
831 """Information extractor for photobucket.com."""
# NOTE(review): corrupted listing -- stray line-number prefixes, lost
# indentation and missing lines (several `try:`/`return` openers).
# Comments below describe only what the visible fragments show.
# The ID captured by _VALID_URL is the .flv filename in the `current=`
# query parameter.
833 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
834 IE_NAME = u'photobucket'
836 def __init__(self, downloader=None):
837 InfoExtractor.__init__(self, downloader)
839 def report_download_webpage(self, video_id):
840 """Report webpage download."""
841 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
843 def report_extraction(self, video_id):
844 """Report information extraction."""
845 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
847 def _real_extract(self, url):
848 # Extract id from URL
849 mobj = re.match(self._VALID_URL, url)
851 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
854 video_id = mobj.group(1)
856 video_extension = 'flv'
858 # Retrieve video webpage to extract further information
859 request = urllib2.Request(url)
861 self.report_download_webpage(video_id)
862 webpage = urllib2.urlopen(request).read()
863 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
864 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
867 # Extract URL, uploader, and title from webpage
868 self.report_extraction(video_id)
869 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
871 self._downloader.trouble(u'ERROR: unable to extract media URL')
873 mediaURL = urllib.unquote(mobj.group(1))
# Title and uploader both come from the <title> tag.
877 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
879 self._downloader.trouble(u'ERROR: unable to extract title')
881 video_title = mobj.group(1).decode('utf-8')
883 video_uploader = mobj.group(2).decode('utf-8')
# Result dict; the opening `return [{` line is missing from this listing.
886 'id': video_id.decode('utf-8'),
887 'url': video_url.decode('utf-8'),
888 'uploader': video_uploader,
889 'upload_date': u'NA',
890 'title': video_title,
891 'ext': video_extension.decode('utf-8'),
895 class YahooIE(InfoExtractor):
896 """Information extractor for video.yahoo.com."""
898 # _VALID_URL matches all Yahoo! Video URLs
899 # _VPAGE_URL matches only the extractable '/watch/' URLs
900 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
901 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
902 IE_NAME = u'video.yahoo'
904 def __init__(self, downloader=None):
905 InfoExtractor.__init__(self, downloader)
907 def report_download_webpage(self, video_id):
908 """Report webpage download."""
909 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
911 def report_extraction(self, video_id):
912 """Report information extraction."""
913 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
915 def _real_extract(self, url, new_video=True):
916 # Extract ID from URL
917 mobj = re.match(self._VALID_URL, url)
919 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
922 video_id = mobj.group(2)
923 video_extension = 'flv'
925 # Rewrite valid but non-extractable URLs as
926 # extractable English language /watch/ URLs
927 if re.match(self._VPAGE_URL, url) is None:
928 request = urllib2.Request(url)
930 webpage = urllib2.urlopen(request).read()
931 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
932 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
935 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
937 self._downloader.trouble(u'ERROR: Unable to extract id field')
939 yahoo_id = mobj.group(1)
941 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
943 self._downloader.trouble(u'ERROR: Unable to extract vid field')
945 yahoo_vid = mobj.group(1)
947 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
948 return self._real_extract(url, new_video=False)
950 # Retrieve video webpage to extract further information
951 request = urllib2.Request(url)
953 self.report_download_webpage(video_id)
954 webpage = urllib2.urlopen(request).read()
955 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
956 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
959 # Extract uploader and title from webpage
960 self.report_extraction(video_id)
961 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
963 self._downloader.trouble(u'ERROR: unable to extract video title')
965 video_title = mobj.group(1).decode('utf-8')
967 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
969 self._downloader.trouble(u'ERROR: unable to extract video uploader')
971 video_uploader = mobj.group(1).decode('utf-8')
973 # Extract video thumbnail
974 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
976 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
978 video_thumbnail = mobj.group(1).decode('utf-8')
980 # Extract video description
981 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
983 self._downloader.trouble(u'ERROR: unable to extract video description')
985 video_description = mobj.group(1).decode('utf-8')
986 if not video_description:
987 video_description = 'No description available.'
989 # Extract video height and width
990 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
992 self._downloader.trouble(u'ERROR: unable to extract video height')
994 yv_video_height = mobj.group(1)
996 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
998 self._downloader.trouble(u'ERROR: unable to extract video width')
1000 yv_video_width = mobj.group(1)
1002 # Retrieve video playlist to extract media URL
1003 # I'm not completely sure what all these options are, but we
1004 # seem to need most of them, otherwise the server sends a 401.
1005 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1006 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1007 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1008 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1009 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1011 self.report_download_webpage(video_id)
1012 webpage = urllib2.urlopen(request).read()
1013 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1014 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1017 # Extract media URL from playlist XML
1018 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1020 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1022 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1023 video_url = unescapeHTML(video_url)
1026 'id': video_id.decode('utf-8'),
1028 'uploader': video_uploader,
1029 'upload_date': u'NA',
1030 'title': video_title,
1031 'ext': video_extension.decode('utf-8'),
1032 'thumbnail': video_thumbnail.decode('utf-8'),
1033 'description': video_description,
# NOTE(review): this is a numbered, partially-sampled listing of a Python 2
# youtube-dl extractor — each line keeps its original file line number, and
# several intervening lines (`try:` openers, `if mobj is None:` guards,
# `return` statements, the `return [{` opener of the info dict) are missing
# from the extraction. The block is NOT runnable as shown; comments below
# annotate only the visible logic.
1037 class VimeoIE(InfoExtractor):
1038 """Information extractor for vimeo.com."""
1040 # _VALID_URL matches Vimeo URLs
1041 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
1044 def __init__(self, downloader=None):
1045 InfoExtractor.__init__(self, downloader)
1047 def report_download_webpage(self, video_id):
1048 """Report webpage download."""
1049 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1051 def report_extraction(self, video_id):
1052 """Report information extraction."""
1053 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1055 def _real_extract(self, url, new_video=True):
1056 # Extract ID from URL
1057 mobj = re.match(self._VALID_URL, url)
# (review) a `if mobj is None:` guard around the trouble() call is missing
# from this listing — presumably present in the original; verify.
1059 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1062 video_id = mobj.group(1)
1064 # Retrieve video webpage to extract further information
1065 request = urllib2.Request(url, None, std_headers)
# (review) the `try:` opener for this download attempt is missing from the
# listing; the `except` at original line 1069 implies it existed.
1067 self.report_download_webpage(video_id)
1068 webpage = urllib2.urlopen(request).read()
1069 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1070 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1073 # Now we begin extracting as much information as we can from what we
1074 # retrieved. First we extract the information common to all extractors,
1075 # and latter we extract those that are Vimeo specific.
1076 self.report_extraction(video_id)
1078 # Extract the config JSON
# Crude string-splitting scrape of the embedded player config; raises
# IndexError if the markers are absent (a try/except presumably wrapped
# the json.loads at 1081 — the opener is not in this listing).
1079 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1081 config = json.loads(config)
1083 self._downloader.trouble(u'ERROR: unable to extract info section')
1087 video_title = config["video"]["title"]
1090 video_uploader = config["video"]["owner"]["name"]
1092 # Extract video thumbnail
1093 video_thumbnail = config["video"]["thumbnail"]
1095 # Extract video description
1096 video_description = get_element_by_id("description", webpage.decode('utf8'))
1097 if video_description: video_description = clean_html(video_description)
1098 else: video_description = ''
1100 # Extract upload date
1101 video_upload_date = u'NA'
1102 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1103 if mobj is not None:
1104 video_upload_date = mobj.group(1)
1106 # Vimeo specific: extract request signature and timestamp
1107 sig = config['request']['signature']
1108 timestamp = config['request']['timestamp']
1110 # Vimeo specific: extract video codec and quality information
1111 # First consider quality, then codecs, then take everything
1112 # TODO bind to format param
1113 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1114 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by best quality tier: 'hd' preferred over
# 'sd'; anything else lands in 'other' (an `else:` for that append at 1122
# is missing from this listing).
1115 for codec_name, codec_extension in codecs:
1116 if codec_name in config["video"]["files"]:
1117 if 'hd' in config["video"]["files"][codec_name]:
1118 files['hd'].append((codec_name, codec_extension, 'hd'))
1119 elif 'sd' in config["video"]["files"][codec_name]:
1120 files['sd'].append((codec_name, codec_extension, 'sd'))
1122 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first entry of the best non-empty tier (a `break` presumably
# followed line 1129 in the original; not visible here).
1124 for quality in ('hd', 'sd', 'other'):
1125 if len(files[quality]) > 0:
1126 video_quality = files[quality][0][2]
1127 video_codec = files[quality][0][0]
1128 video_extension = files[quality][0][1]
1129 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1132 self._downloader.trouble(u'ERROR: no known codec found')
1135 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1136 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# (review) the `return [{ 'id': ..., 'url': ... }]` opener/closer of the
# info-dict result is missing from this listing; only its middle survives.
1141 'uploader': video_uploader,
1142 'upload_date': video_upload_date,
1143 'title': video_title,
1144 'ext': video_extension,
1145 'thumbnail': video_thumbnail,
1146 'description': video_description,
# NOTE(review): numbered, partially-sampled listing — original lines between
# the numbered ones (`try:` openers, `if mobj is None:` guards, `return`s,
# class docstring quotes) are missing. Not runnable as shown; comments
# describe visible logic only.
1150 class GenericIE(InfoExtractor):
1151 """Generic last-resort information extractor."""
1154 IE_NAME = u'generic'
1156 def __init__(self, downloader=None):
1157 InfoExtractor.__init__(self, downloader)
1159 def report_download_webpage(self, video_id):
1160 """Report webpage download."""
# Warns on every use: this IE only runs when no specific extractor matched.
1161 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1162 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1164 def report_extraction(self, video_id):
1165 """Report information extraction."""
1166 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1168 def report_following_redirect(self, new_url):
1169 """Report information extraction."""
1170 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1172 def _test_redirect(self, url):
1173 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Local Request subclass that issues HEAD instead of GET (the
# `return "HEAD"` body at original line ~1176 is missing from this listing).
1174 class HeadRequest(urllib2.Request):
1175 def get_method(self):
1178 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
# (review) the triple-quote delimiters of this docstring were dropped by
# the extraction; lines 1180-1181 are its text.
1180 Subclass the HTTPRedirectHandler to make it use our
1181 HeadRequest also on the redirected URL
1183 def redirect_request(self, req, fp, code, msg, headers, newurl):
1184 if code in (301, 302, 303, 307):
1185 newurl = newurl.replace(' ', '%20')
# Strip body-describing headers: a HEAD/GET retry carries no body.
1186 newheaders = dict((k,v) for k,v in req.headers.items()
1187 if k.lower() not in ("content-length", "content-type"))
1188 return HeadRequest(newurl,
1190 origin_req_host=req.get_origin_req_host(),
# (review) an `else:` presumably preceded this raise; not in the listing.
1193 raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1195 class HTTPMethodFallback(urllib2.BaseHandler):
# (review) docstring delimiters dropped here as well.
1197 Fallback to GET if HEAD is not allowed (405 HTTP error)
1199 def http_error_405(self, req, fp, code, msg, headers):
1203 newheaders = dict((k,v) for k,v in req.headers.items()
1204 if k.lower() not in ("content-length", "content-type"))
# Re-open the same URL with a plain (GET) Request through the parent opener.
1205 return self.parent.open(urllib2.Request(req.get_full_url(),
1207 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener by hand so only these handlers (and no default
# redirect handling) participate in the HEAD probe.
1211 opener = urllib2.OpenerDirector()
1212 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1213 HTTPMethodFallback, HEADRedirectHandler,
1214 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1215 opener.add_handler(handler())
1217 response = opener.open(HeadRequest(url))
1218 new_url = response.geturl()
1220 if url == new_url: return False
# URL changed: restart the whole extractor chain on the redirect target.
1222 self.report_following_redirect(new_url)
1223 self._downloader.download([new_url])
# (review) a `return True` presumably followed; missing from this listing.
1226 def _real_extract(self, url):
1227 if self._test_redirect(url): return
1229 video_id = url.split('/')[-1]
1230 request = urllib2.Request(url)
# (review) the `try:` opener for the two `except` clauses below is missing.
1232 self.report_download_webpage(video_id)
1233 webpage = urllib2.urlopen(request).read()
1234 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1235 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1237 except ValueError, err:
1238 # since this is the last-resort InfoExtractor, if
1239 # this error is thrown, it'll be thrown here
1240 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1243 self.report_extraction(video_id)
1244 # Start with something easy: JW Player in SWFObject
1245 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
# (review) an `if mobj is None:` guard between these searches is missing.
1247 # Broaden the search a little bit
1248 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1250 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1253 # It's possible that one of the regexes
1254 # matched, but returned an empty group:
1255 if mobj.group(1) is None:
1256 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1259 video_url = urllib.unquote(mobj.group(1))
1260 video_id = os.path.basename(video_url)
1262 # here's a fun little line of code for you:
1263 video_extension = os.path.splitext(video_id)[1][1:]
1264 video_id = os.path.splitext(video_id)[0]
1266 # it's tempting to parse this further, but you would
1267 # have to take into account all the variations like
1268 # Video Title - Site Name
1269 # Site Name | Video Title
1270 # Video Title - Tagline | Site Name
1271 # and so on and so forth; it's just not practical
1272 mobj = re.search(r'<title>(.*)</title>', webpage)
1274 self._downloader.trouble(u'ERROR: unable to extract title')
1276 video_title = mobj.group(1).decode('utf-8')
1278 # video uploader is domain name
1279 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1281 self._downloader.trouble(u'ERROR: unable to extract title')
1283 video_uploader = mobj.group(1).decode('utf-8')
# (review) `return [{` / `}]` delimiters of the result list are missing.
1286 'id': video_id.decode('utf-8'),
1287 'url': video_url.decode('utf-8'),
1288 'uploader': video_uploader,
1289 'upload_date': u'NA',
1290 'title': video_title,
1291 'ext': video_extension.decode('utf-8'),
# NOTE(review): numbered, partially-sampled listing — `try:` openers,
# `if mobj is None:` guards, `if prefix == '':` branches and `return`s are
# missing between the numbered lines. Not runnable as shown.
1295 class YoutubeSearchIE(InfoExtractor):
1296 """Information Extractor for YouTube search queries."""
# Accepts pseudo-URLs of the form ytsearch:QUERY, ytsearchN:QUERY,
# ytsearchall:QUERY.
1297 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1298 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1299 _max_youtube_results = 1000
1300 IE_NAME = u'youtube:search'
1302 def __init__(self, downloader=None):
1303 InfoExtractor.__init__(self, downloader)
1305 def report_download_page(self, query, pagenum):
1306 """Report attempt to download search page with given number."""
1307 query = query.decode(preferredencoding())
1308 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1310 def _real_extract(self, query):
1311 mobj = re.match(self._VALID_URL, query)
# (review) `if mobj is None:` guard missing from listing.
1313 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the ytsearchN prefix from the query proper.
1316 prefix, query = query.split(':')
1318 query = query.encode('utf-8')
# (review) the `if prefix == '':` branch head for this default-1 case is
# missing; only the body line survives.
1320 self._download_n_results(query, 1)
1322 elif prefix == 'all':
1323 self._download_n_results(query, self._max_youtube_results)
# (review) the `else:` / `try: n = long(prefix)` / `if n <= 0:` lines are
# missing; the trouble() below is the n<=0 error path.
1329 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1331 elif n > self._max_youtube_results:
1332 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1333 n = self._max_youtube_results
1334 self._download_n_results(query, n)
1336 except ValueError: # parsing prefix as integer fails
1337 self._download_n_results(query, 1)
1340 def _download_n_results(self, query, n):
1341 """Downloads a specified number of results for a query"""
# (review) initialization of video_ids/pagenum/limit is missing from the
# listing; the loop below pages the GData API 50 results at a time.
1347 while (50 * pagenum) < limit:
1348 self.report_download_page(query, pagenum+1)
1349 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1350 request = urllib2.Request(result_url)
# (review) `try:` opener missing.
1352 data = urllib2.urlopen(request).read()
1353 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1354 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1356 api_response = json.loads(data)['data']
1358 new_ids = list(video['id'] for video in api_response['items'])
1359 video_ids += new_ids
# Cap the target at however many results the API says exist.
1361 limit = min(n, api_response['totalItems'])
# (review) a `pagenum += 1` presumably closed the loop; missing here.
1364 if len(video_ids) > n:
1365 video_ids = video_ids[:n]
1366 for id in video_ids:
1367 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered, partially-sampled listing — guard/try/return lines
# missing between the numbered lines. Not runnable as shown. Structure
# parallels YoutubeSearchIE but scrapes HTML result pages instead of an API.
1371 class GoogleSearchIE(InfoExtractor):
1372 """Information Extractor for Google Video search queries."""
1373 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1374 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1375 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1376 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1377 _max_google_results = 1000
1378 IE_NAME = u'video.google:search'
1380 def __init__(self, downloader=None):
1381 InfoExtractor.__init__(self, downloader)
1383 def report_download_page(self, query, pagenum):
1384 """Report attempt to download playlist page with given number."""
1385 query = query.decode(preferredencoding())
1386 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1388 def _real_extract(self, query):
1389 mobj = re.match(self._VALID_URL, query)
# (review) `if mobj is None:` guard missing from listing.
1391 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1394 prefix, query = query.split(':')
1396 query = query.encode('utf-8')
# (review) `if prefix == '':` branch head missing — default is 1 result.
1398 self._download_n_results(query, 1)
1400 elif prefix == 'all':
1401 self._download_n_results(query, self._max_google_results)
# (review) `else:` / `try: n = long(prefix)` / `if n <= 0:` missing; the
# trouble() below is the n<=0 error path.
1407 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1409 elif n > self._max_google_results:
1410 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1411 n = self._max_google_results
1412 self._download_n_results(query, n)
1414 except ValueError: # parsing prefix as integer fails
1415 self._download_n_results(query, 1)
1418 def _download_n_results(self, query, n):
1419 """Downloads a specified number of results for a query"""
# (review) video_ids/pagenum init and the `while True:` opener are missing
# from the listing.
1425 self.report_download_page(query, pagenum)
1426 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1427 request = urllib2.Request(result_url)
# (review) `try:` opener missing.
1429 page = urllib2.urlopen(request).read()
1430 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1431 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1434 # Extract video identifiers
1435 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1436 video_id = mobj.group(1)
1437 if video_id not in video_ids:
1438 video_ids.append(video_id)
1439 if len(video_ids) == n:
1440 # Specified n videos reached
1441 for id in video_ids:
1442 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# (review) a `return` presumably ended this branch; missing here.
# No "next page" link: flush everything collected so far and stop.
1445 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1446 for id in video_ids:
1447 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1450 pagenum = pagenum + 1
# NOTE(review): numbered, partially-sampled listing — guard/try/return lines
# missing. Not runnable as shown. Mirrors GoogleSearchIE, with an extra
# `already_seen` set for de-duplication.
1453 class YahooSearchIE(InfoExtractor):
1454 """Information Extractor for Yahoo! Video search queries."""
1455 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1456 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1457 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1458 _MORE_PAGES_INDICATOR = r'\s*Next'
1459 _max_yahoo_results = 1000
1460 IE_NAME = u'video.yahoo:search'
1462 def __init__(self, downloader=None):
1463 InfoExtractor.__init__(self, downloader)
1465 def report_download_page(self, query, pagenum):
1466 """Report attempt to download playlist page with given number."""
1467 query = query.decode(preferredencoding())
1468 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1470 def _real_extract(self, query):
1471 mobj = re.match(self._VALID_URL, query)
# (review) `if mobj is None:` guard missing from listing.
1473 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1476 prefix, query = query.split(':')
1478 query = query.encode('utf-8')
# (review) `if prefix == '':` branch head missing — default is 1 result.
1480 self._download_n_results(query, 1)
1482 elif prefix == 'all':
1483 self._download_n_results(query, self._max_yahoo_results)
# (review) `else:` / `try: n = long(prefix)` / `if n <= 0:` missing.
1489 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1491 elif n > self._max_yahoo_results:
1492 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1493 n = self._max_yahoo_results
1494 self._download_n_results(query, n)
1496 except ValueError: # parsing prefix as integer fails
1497 self._download_n_results(query, 1)
1500 def _download_n_results(self, query, n):
1501 """Downloads a specified number of results for a query"""
# (review) video_ids/pagenum init and the loop opener are missing from the
# listing; `already_seen` de-duplicates ids across pages.
1504 already_seen = set()
1508 self.report_download_page(query, pagenum)
1509 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1510 request = urllib2.Request(result_url)
# (review) `try:` opener missing.
1512 page = urllib2.urlopen(request).read()
1513 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1514 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1517 # Extract video identifiers
1518 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1519 video_id = mobj.group(1)
1520 if video_id not in already_seen:
1521 video_ids.append(video_id)
1522 already_seen.add(video_id)
1523 if len(video_ids) == n:
1524 # Specified n videos reached
1525 for id in video_ids:
1526 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "next page" link: flush everything collected so far and stop.
1529 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1530 for id in video_ids:
1531 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1534 pagenum = pagenum + 1
# NOTE(review): numbered, partially-sampled listing — guard/try/return/else
# lines missing between the numbered lines. Not runnable as shown.
1537 class YoutubePlaylistIE(InfoExtractor):
1538 """Information Extractor for YouTube playlists."""
# Matches playlist/course/artist/user-channel URL variants as well as bare
# PL/EC playlist ids; group(1)=page type prefix, group(2)=playlist id,
# group(3)=optional trailing video id.
1540 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1541 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1542 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1543 _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1544 IE_NAME = u'youtube:playlist'
1546 def __init__(self, downloader=None):
1547 InfoExtractor.__init__(self, downloader)
1549 def report_download_page(self, playlist_id, pagenum):
1550 """Report attempt to download playlist page with given number."""
1551 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1553 def _real_extract(self, url):
1554 # Extract playlist id
1555 mobj = re.match(self._VALID_URL, url)
# (review) `if mobj is None:` guard missing from listing.
1557 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video case: URL carried a concrete video id, hand it straight to
# the downloader instead of walking the playlist.
1561 if mobj.group(3) is not None:
1562 self._downloader.download([mobj.group(3)])
# (review) a `return` presumably followed; missing here.
1565 # Download playlist pages
1566 # prefix is 'p' as default for playlists but there are other types that need extra care
1567 playlist_prefix = mobj.group(1)
1568 if playlist_prefix == 'a':
1569 playlist_access = 'artist'
# (review) the `else:` head for this default branch is missing.
1571 playlist_prefix = 'p'
1572 playlist_access = 'view_play_list'
1573 playlist_id = mobj.group(2)
# (review) video_ids/pagenum init and `while True:` opener are missing.
1578 self.report_download_page(playlist_id, pagenum)
1579 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1580 request = urllib2.Request(url)
# (review) `try:` opener missing.
1582 page = urllib2.urlopen(request).read()
1583 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1584 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1587 # Extract video identifiers
1589 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1590 if mobj.group(1) not in ids_in_page:
1591 ids_in_page.append(mobj.group(1))
1592 video_ids.extend(ids_in_page)
# Stop paging when no "next" pager link is found (a `break` presumably
# followed; missing from this listing).
1594 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1596 pagenum = pagenum + 1
# Apply --playlist-start/--playlist-end windowing (1-based options mapped
# to 0-based slicing; -1 end means "to the end").
1598 playliststart = self._downloader.params.get('playliststart', 1) - 1
1599 playlistend = self._downloader.params.get('playlistend', -1)
1600 if playlistend == -1:
1601 video_ids = video_ids[playliststart:]
# (review) `else:` head for the bounded slice is missing.
1603 video_ids = video_ids[playliststart:playlistend]
1605 for id in video_ids:
1606 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered, partially-sampled listing — guard/try/break lines
# missing between the numbered lines. Not runnable as shown.
1610 class YoutubeChannelIE(InfoExtractor):
1611 """Information Extractor for YouTube channels."""
1613 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1614 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1615 _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1616 IE_NAME = u'youtube:channel'
1618 def report_download_page(self, channel_id, pagenum):
1619 """Report attempt to download channel page with given number."""
1620 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1622 def _real_extract(self, url):
1623 # Extract channel id
1624 mobj = re.match(self._VALID_URL, url)
# (review) `if mobj is None:` guard missing from listing.
1626 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1629 # Download channel pages
1630 channel_id = mobj.group(1)
# (review) video_ids/pagenum init and `while True:` opener are missing.
1635 self.report_download_page(channel_id, pagenum)
1636 url = self._TEMPLATE_URL % (channel_id, pagenum)
1637 request = urllib2.Request(url)
# (review) `try:` opener missing.
1639 page = urllib2.urlopen(request).read()
1640 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1641 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1644 # Extract video identifiers
1646 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1647 if mobj.group(1) not in ids_in_page:
1648 ids_in_page.append(mobj.group(1))
1649 video_ids.extend(ids_in_page)
# Stop paging when no "Next" button is present (a `break` presumably
# followed; missing from this listing).
1651 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1653 pagenum = pagenum + 1
1655 for id in video_ids:
1656 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered, partially-sampled listing — guard/try/break/else
# lines missing between the numbered lines. Not runnable as shown.
1660 class YoutubeUserIE(InfoExtractor):
1661 """Information Extractor for YouTube users."""
1663 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1664 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps each response at 50 entries, hence the paging below.
1665 _GDATA_PAGE_SIZE = 50
1666 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1667 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1668 IE_NAME = u'youtube:user'
1670 def __init__(self, downloader=None):
1671 InfoExtractor.__init__(self, downloader)
1673 def report_download_page(self, username, start_index):
1674 """Report attempt to download user page."""
1675 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1676 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1678 def _real_extract(self, url):
1680 mobj = re.match(self._VALID_URL, url)
# (review) `if mobj is None:` guard missing from listing.
1682 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1685 username = mobj.group(1)
1687 # Download video ids using YouTube Data API. Result size per
1688 # query is limited (currently to 50 videos) so we need to query
1689 # page by page until there are no video ids - it means we got
# (review) the rest of this comment, video_ids/pagenum init and the
# `while True:` opener are missing from the listing.
1696 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1697 self.report_download_page(username, start_index)
1699 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# (review) `try:` opener missing.
1702 page = urllib2.urlopen(request).read()
1703 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1704 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1707 # Extract video identifiers
1710 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1711 if mobj.group(1) not in ids_in_page:
1712 ids_in_page.append(mobj.group(1))
1714 video_ids.extend(ids_in_page)
1716 # A little optimization - if current page is not
1717 # "full", ie. does not contain PAGE_SIZE video ids then
1718 # we can assume that this page is the last one - there
1719 # are no more ids on further pages - no need to query
# Short page means last page; a `break` (and `pagenum += 1`) presumably
# followed — missing from this listing.
1722 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1727 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end windowing, as in YoutubePlaylistIE.
1728 playliststart = self._downloader.params.get('playliststart', 1) - 1
1729 playlistend = self._downloader.params.get('playlistend', -1)
1731 if playlistend == -1:
1732 video_ids = video_ids[playliststart:]
# (review) `else:` head for the bounded slice is missing.
1734 video_ids = video_ids[playliststart:playlistend]
1736 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1737 (username, all_ids_count, len(video_ids)))
1739 for video_id in video_ids:
1740 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): numbered, partially-sampled listing — guard/try/break/else
# lines missing between the numbered lines. Not runnable as shown. Note the
# loop checks self._PAGE_SIZE, whose class-level definition is not in this
# listing (presumably 12, per the comment at 1781) — verify in the original.
1743 class BlipTVUserIE(InfoExtractor):
1744 """Information Extractor for blip.tv users."""
1746 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1748 IE_NAME = u'blip.tv:user'
1750 def __init__(self, downloader=None):
1751 InfoExtractor.__init__(self, downloader)
1753 def report_download_page(self, username, pagenum):
1754 """Report attempt to download user page."""
1755 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1756 (self.IE_NAME, username, pagenum))
1758 def _real_extract(self, url):
1760 mobj = re.match(self._VALID_URL, url)
# (review) `if mobj is None:` guard missing from listing.
1762 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1765 username = mobj.group(1)
1767 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1769 request = urllib2.Request(url)
# First fetch the user page only to scrape the numeric users_id needed by
# the episode-list endpoint. (review) `try:` opener missing; also note
# re.search here is unguarded — mobj.group(1) would raise AttributeError
# on no match in the original too, unless a missing line guards it.
1772 page = urllib2.urlopen(request).read().decode('utf-8')
1773 mobj = re.search(r'data-users-id="([^"]+)"', page)
1774 page_base = page_base % mobj.group(1)
1775 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1780 # Download video ids using BlipTV Ajax calls. Result size per
1781 # query is limited (currently to 12 videos) so we need to query
1782 # page by page until there are no video ids - it means we got
# (review) rest of comment, video_ids/pagenum init and loop opener missing.
1789 self.report_download_page(username, pagenum)
1791 request = urllib2.Request( page_base + "&page=" + str(pagenum) )
# (review) `try:` opener missing. Also note this handler uses str(err)
# while the sibling extractors use compat_str(err) — inconsistent in the
# original; worth unifying when the full file is available.
1794 page = urllib2.urlopen(request).read().decode('utf-8')
1795 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1796 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1799 # Extract video identifiers
1802 for mobj in re.finditer(r'href="/([^"]+)"', page):
1803 if mobj.group(1) not in ids_in_page:
1804 ids_in_page.append(unescapeHTML(mobj.group(1)))
1806 video_ids.extend(ids_in_page)
1808 # A little optimization - if current page is not
1809 # "full", ie. does not contain PAGE_SIZE video ids then
1810 # we can assume that this page is the last one - there
1811 # are no more ids on further pages - no need to query
# Short page means last page; a `break` (and `pagenum += 1`) presumably
# followed — missing from this listing.
1814 if len(ids_in_page) < self._PAGE_SIZE:
1819 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end windowing, as in the YouTube IEs.
1820 playliststart = self._downloader.params.get('playliststart', 1) - 1
1821 playlistend = self._downloader.params.get('playlistend', -1)
1823 if playlistend == -1:
1824 video_ids = video_ids[playliststart:]
# (review) `else:` head for the bounded slice is missing.
1826 video_ids = video_ids[playliststart:playlistend]
1828 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1829 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1831 for video_id in video_ids:
1832 self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): numbered, partially-sampled listing — guard/try/return lines
# missing between the numbered lines. Not runnable as shown.
1835 class DepositFilesIE(InfoExtractor):
1836 """Information extractor for depositfiles.com"""
1838 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1839 IE_NAME = u'DepositFiles'
1841 def __init__(self, downloader=None):
1842 InfoExtractor.__init__(self, downloader)
1844 def report_download_webpage(self, file_id):
1845 """Report webpage download."""
1846 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1848 def report_extraction(self, file_id):
1849 """Report information extraction."""
1850 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1852 def _real_extract(self, url):
1853 file_id = url.split('/')[-1]
1854 # Rebuild url in english locale
1855 url = 'http://depositfiles.com/en/files/' + file_id
1857 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1858 free_download_indication = { 'gateway_result' : '1' }
1859 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
# (review) `try:` opener missing from listing.
1861 self.report_download_webpage(file_id)
1862 webpage = urllib2.urlopen(request).read()
1863 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1864 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1867 # Search for the real file URL
1868 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1869 if (mobj is None) or (mobj.group(1) is None):
1870 # Try to figure out reason of the error.
1871 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1872 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse the restriction banner's whitespace into one readable line.
1873 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1874 self._downloader.trouble(u'ERROR: %s' % restriction_message)
# (review) `else:` head for the generic-failure path is missing.
1876 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1879 file_url = mobj.group(1)
1880 file_extension = os.path.splitext(file_url)[1][1:]
1882 # Search for file title
1883 mobj = re.search(r'<b title="(.*?)">', webpage)
# (review) `if mobj is None:` guard missing from listing.
1885 self._downloader.trouble(u'ERROR: unable to extract title')
1887 file_title = mobj.group(1).decode('utf-8')
# (review) `return [{` / `}]` delimiters of the result list are missing;
# note no 'uploader' key is visible for this IE.
1890 'id': file_id.decode('utf-8'),
1891 'url': file_url.decode('utf-8'),
1893 'upload_date': u'NA',
1894 'title': file_title,
1895 'ext': file_extension.decode('utf-8'),
# NOTE(review): garbled extract -- each line below carries a stray original
# line-number prefix and several interior lines (guards, try:/else:, returns)
# are missing. Code is kept byte-identical; comments only.
1899 class FacebookIE(InfoExtractor):
1900 """Information Extractor for Facebook"""
# Matches facebook.com/video/video.php?v=<ID> and /photo.php?v=<ID> URLs.
1903 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1904 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1905 _NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; also used as the quality ladder in _real_extract.
1906 _available_formats = ['video', 'highqual', 'lowqual']
1907 _video_extensions = {
1912 IE_NAME = u'facebook'
1914 def __init__(self, downloader=None):
1915 InfoExtractor.__init__(self, downloader)
1917 def _reporter(self, message):
1918 """Add header and report message."""
1919 self._downloader.to_screen(u'[facebook] %s' % message)
1921 def report_login(self):
1922 """Report attempt to log in."""
1923 self._reporter(u'Logging in')
1925 def report_video_webpage_download(self, video_id):
1926 """Report attempt to download video webpage."""
1927 self._reporter(u'%s: Downloading video webpage' % video_id)
1929 def report_information_extraction(self, video_id):
1930 """Report attempt to extract video information."""
1931 self._reporter(u'%s: Extracting video information' % video_id)
1933 def _parse_page(self, video_webpage):
1934 """Extract video information from page"""
# Scrapes metadata and per-format stream URLs out of inline JavaScript;
# values are unicode-escaped inside the (otherwise utf-8) page.
1936 data = {'title': r'\("video_title", "(.*?)"\)',
1937 'description': r'<div class="datawrap">(.*?)</div>',
1938 'owner': r'\("video_owner_name", "(.*?)"\)',
1939 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1942 for piece in data.keys():
1943 mobj = re.search(data[piece], video_webpage)
1944 if mobj is not None:
1945 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# One "<fmt>_src" URL per known format, collected into video_urls.
1949 for fmt in self._available_formats:
1950 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1951 if mobj is not None:
1952 # URL is in a Javascript segment inside an escaped Unicode format within
1953 # the generally utf-8 page
1954 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1955 video_info['video_urls'] = video_urls
# Logs in (via --username/--password or .netrc) before extraction; failures
# are warnings only, extraction proceeds unauthenticated.
1959 def _real_initialize(self):
1960 if self._downloader is None:
1965 downloader_params = self._downloader.params
1967 # Attempt to use provided username and password or .netrc data
1968 if downloader_params.get('username', None) is not None:
1969 useremail = downloader_params['username']
1970 password = downloader_params['password']
1971 elif downloader_params.get('usenetrc', False):
1973 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1974 if info is not None:
1978 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1979 except (IOError, netrc.NetrcParseError), err:
1980 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1983 if useremail is None:
1992 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1995 login_results = urllib2.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
1996 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1997 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1999 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2000 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2003 def _real_extract(self, url):
2004 mobj = re.match(self._VALID_URL, url)
2006 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2008 video_id = mobj.group('ID')
2011 self.report_video_webpage_download(video_id)
2012 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2014 page = urllib2.urlopen(request)
2015 video_webpage = page.read()
2016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2017 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2020 # Start extracting information
2021 self.report_information_extraction(video_id)
2023 # Extract information
2024 video_info = self._parse_page(video_webpage)
2027 if 'owner' not in video_info:
2028 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2030 video_uploader = video_info['owner']
2033 if 'title' not in video_info:
2034 self._downloader.trouble(u'ERROR: unable to extract video title')
2036 video_title = video_info['title']
2037 video_title = video_title.decode('utf-8')
# Missing thumbnail is a warning, not fatal; falls back to empty string.
2040 if 'thumbnail' not in video_info:
2041 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2042 video_thumbnail = ''
2044 video_thumbnail = video_info['thumbnail']
2048 if 'upload_date' in video_info:
2049 upload_time = video_info['upload_date']
2050 timetuple = email.utils.parsedate_tz(upload_time)
2051 if timetuple is not None:
2053 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2058 video_description = video_info.get('description', 'No description available.')
# Format selection mirrors the --format/--format-limit semantics used by
# other extractors in this file: best (default), 'worst', '-1' (all),
# or one specific format name.
2060 url_map = video_info['video_urls']
2061 if len(url_map.keys()) > 0:
2062 # Decide which formats to download
2063 req_format = self._downloader.params.get('format', None)
2064 format_limit = self._downloader.params.get('format_limit', None)
2066 if format_limit is not None and format_limit in self._available_formats:
2067 format_list = self._available_formats[self._available_formats.index(format_limit):]
2069 format_list = self._available_formats
2070 existing_formats = [x for x in format_list if x in url_map]
2071 if len(existing_formats) == 0:
2072 self._downloader.trouble(u'ERROR: no known formats available for video')
2074 if req_format is None:
2075 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2076 elif req_format == 'worst':
2077 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2078 elif req_format == '-1':
2079 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2082 if req_format not in url_map:
2083 self._downloader.trouble(u'ERROR: requested format not available')
2085 video_url_list = [(req_format, url_map[req_format])] # Specific format
2088 for format_param, video_real_url in video_url_list:
2090 video_extension = self._video_extensions.get(format_param, 'mp4')
2093 'id': video_id.decode('utf-8'),
2094 'url': video_real_url.decode('utf-8'),
2095 'uploader': video_uploader.decode('utf-8'),
2096 'upload_date': upload_date,
2097 'title': video_title,
2098 'ext': video_extension.decode('utf-8'),
# py2 and-or idiom: u'NA' when format_param is falsy, else the format name.
2099 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2100 'thumbnail': video_thumbnail.decode('utf-8'),
2101 'description': video_description.decode('utf-8'),
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2105 class BlipTVIE(InfoExtractor):
2106 """Information extractor for blip.tv"""
2108 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2109 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2110 IE_NAME = u'blip.tv'
2112 def report_extraction(self, file_id):
2113 """Report information extraction."""
2114 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2116 def report_direct_download(self, title):
2117 """Report information extraction."""
2118 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2120 def _real_extract(self, url):
2121 mobj = re.match(self._VALID_URL, url)
2123 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv's JSON API for the metadata of the same page URL.
2130 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2131 request = urllib2.Request(json_url.encode('utf-8'))
2132 self.report_extraction(mobj.group(1))
2135 urlh = urllib2.urlopen(request)
# Server answered with the media itself instead of JSON: treat as direct
# download and derive id/title/ext from the URL basename.
2136 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2137 basename = url.split('/')[-1]
2138 title,ext = os.path.splitext(basename)
2139 title = title.decode('UTF-8')
2140 ext = ext.replace('.', '')
2141 self.report_direct_download(title)
2146 'upload_date': u'NA',
2151 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2152 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2154 if info is None: # Regular URL
2156 json_code = urlh.read()
2157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2158 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2162 json_data = json.loads(json_code)
2163 if 'Post' in json_data:
2164 data = json_data['Post']
# API datestamp like '08-31-11 07:00PM' -> YYYYMMDD.
2168 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2169 video_url = data['media']['url']
2170 umobj = re.match(self._URL_EXT, video_url)
2172 raise ValueError('Can not determine filename extension')
2173 ext = umobj.group(1)
2176 'id': data['item_id'],
2178 'uploader': data['display_name'],
2179 'upload_date': upload_date,
2180 'title': data['title'],
2182 'format': data['media']['mimeType'],
2183 'thumbnail': data['thumbnailUrl'],
2184 'description': data['description'],
2185 'player_url': data['embedUrl']
2187 except (ValueError,KeyError), err:
2188 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv serves some content only to recognized clients; spoof iTunes.
2191 std_headers['User-Agent'] = 'iTunes/10.6.1'
2195 class MyVideoIE(InfoExtractor):
2196 """Information Extractor for myvideo.de."""
2198 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2199 IE_NAME = u'myvideo'
2201 def __init__(self, downloader=None):
2202 InfoExtractor.__init__(self, downloader)
2204 def report_download_webpage(self, video_id):
2205 """Report webpage download."""
2206 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2208 def report_extraction(self, video_id):
2209 """Report information extraction."""
2210 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2212 def _real_extract(self,url):
2213 mobj = re.match(self._VALID_URL, url)
2215 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2218 video_id = mobj.group(1)
2221 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2223 self.report_download_webpage(video_id)
2224 webpage = urllib2.urlopen(request).read()
2225 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2226 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2229 self.report_extraction(video_id)
2230 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2233 self._downloader.trouble(u'ERROR: unable to extract media URL')
2235 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2237 mobj = re.search('<title>([^<]+)</title>', webpage)
2239 self._downloader.trouble(u'ERROR: unable to extract title')
2242 video_title = mobj.group(1)
2248 'upload_date': u'NA',
2249 'title': video_title,
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2253 class ComedyCentralIE(InfoExtractor):
2254 """Information extractor for The Daily Show and Colbert Report """
# Accepts either a short alias (":tds", ":colbert", ...) or a full
# full-episodes URL on thedailyshow.com / colbertnation.com.
2256 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2257 IE_NAME = u'comedycentral'
# Bitrates, best first... NOTE(review): ordering here appears best-to-worst,
# but turls[-1] below is taken as "highest bitrate" -- confirm list order.
2259 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2261 _video_extensions = {
2269 _video_dimensions = {
2278 def report_extraction(self, episode_id):
2279 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2281 def report_config_download(self, episode_id):
2282 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2284 def report_index_download(self, episode_id):
2285 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2287 def report_player_url(self, episode_id):
2288 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Pretty-prints bitrate / extension / dimensions for --list-formats.
2291 def _print_formats(self, formats):
2292 print('Available formats:')
2294 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2297 def _real_extract(self, url):
2298 mobj = re.match(self._VALID_URL, url)
2300 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Short aliases redirect to the newest full episode of the show.
2303 if mobj.group('shortname'):
2304 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2305 url = u'http://www.thedailyshow.com/full-episodes/'
2307 url = u'http://www.colbertnation.com/full-episodes/'
2308 mobj = re.match(self._VALID_URL, url)
2309 assert mobj is not None
2311 dlNewest = not mobj.group('episode')
2313 epTitle = mobj.group('showname')
2315 epTitle = mobj.group('episode')
2317 req = urllib2.Request(url)
2318 self.report_extraction(epTitle)
2320 htmlHandle = urllib2.urlopen(req)
2321 html = htmlHandle.read()
2322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2323 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Follow the redirect to the concrete episode URL and re-validate it.
2326 url = htmlHandle.geturl()
2327 mobj = re.match(self._VALID_URL, url)
2329 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2331 if mobj.group('episode') == '':
2332 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2334 epTitle = mobj.group('episode')
# The mtvnservices Flash URL embeds the mgid-style URI that keys the
# episode's RSS index.
2336 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2338 if len(mMovieParams) == 0:
2339 # The Colbert Report embeds the information in a without
2340 # a URL prefix; so extract the alternate reference
2341 # and then add the URL prefix manually.
2343 altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
2344 if len(altMovieParams) == 0:
2345 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2348 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2350 playerUrl_raw = mMovieParams[0][0]
2351 self.report_player_url(epTitle)
2353 urlHandle = urllib2.urlopen(playerUrl_raw)
2354 playerUrl = urlHandle.geturl()
2355 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2356 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2359 uri = mMovieParams[0][1]
2360 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2361 self.report_index_download(epTitle)
2363 indexXml = urllib2.urlopen(indexUrl).read()
2364 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2365 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
# Each <item> in the MRSS index is one video segment of the episode.
2370 idoc = xml.etree.ElementTree.fromstring(indexXml)
2371 itemEls = idoc.findall('.//item')
2372 for itemEl in itemEls:
2373 mediaId = itemEl.findall('./guid')[0].text
2374 shortMediaId = mediaId.split(':')[-1]
2375 showId = mediaId.split(':')[-2].replace('.com', '')
2376 officialTitle = itemEl.findall('./title')[0].text
2377 officialDate = itemEl.findall('./pubDate')[0].text
2379 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2380 urllib.urlencode({'uri': mediaId}))
2381 configReq = urllib2.Request(configUrl)
2382 self.report_config_download(epTitle)
2384 configXml = urllib2.urlopen(configReq).read()
2385 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2386 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
# Collect (bitrate, rtmp URL) pairs from the per-segment config XML.
2389 cdoc = xml.etree.ElementTree.fromstring(configXml)
2391 for rendition in cdoc.findall('.//rendition'):
2392 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2396 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2399 if self._downloader.params.get('listformats', None):
2400 self._print_formats([i[0] for i in turls])
2403 # For now, just pick the highest bitrate
2404 format,video_url = turls[-1]
2406 # Get the format arg from the arg stream
2407 req_format = self._downloader.params.get('format', None)
2409 # Select format if we can find one
2412 format, video_url = f, v
2415 # Patch to download from alternative CDN, which does not
2416 # break on current RTMPDump builds
2417 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2418 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2420 if video_url.startswith(broken_cdn):
2421 video_url = video_url.replace(broken_cdn, better_cdn)
2423 effTitle = showId + u'-' + epTitle
2428 'upload_date': officialDate,
2433 'description': officialTitle,
2434 'player_url': None #playerUrl
2437 results.append(info)
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2442 class EscapistIE(InfoExtractor):
2443 """Information extractor for The Escapist """
2445 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2446 IE_NAME = u'escapist'
2448 def report_extraction(self, showName):
2449 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2451 def report_config_download(self, showName):
2452 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2454 def _real_extract(self, url):
2455 mobj = re.match(self._VALID_URL, url)
2457 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2459 showName = mobj.group('showname')
2460 videoId = mobj.group('episode')
2462 self.report_extraction(showName)
2464 webPage = urllib2.urlopen(url)
2465 webPageBytes = webPage.read()
# Decode using the charset declared in the Content-Type header,
# defaulting to utf-8 when none is given.
2466 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2467 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
# Metadata comes from <meta> tags; the player config URL is embedded in
# the og:video player URL's "config=" query parameter.
2472 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2473 description = unescapeHTML(descMatch.group(1))
2474 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2475 imgUrl = unescapeHTML(imgMatch.group(1))
2476 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2477 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2478 configUrlMatch = re.search('config=(.*)$', playerUrl)
2479 configUrl = urllib2.unquote(configUrlMatch.group(1))
2481 self.report_config_download(showName)
2483 configJSON = urllib2.urlopen(configUrl).read()
2484 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2485 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2488 # Technically, it's JavaScript, not JSON
2489 configJSON = configJSON.replace("'", '"')
2492 config = json.loads(configJSON)
2493 except (ValueError,), err:
2494 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
# playlist[1] holds the actual episode media entry.
2497 playlist = config['playlist']
2498 videoUrl = playlist[1]['url']
2503 'uploader': showName,
2504 'upload_date': u'NA',
2507 'thumbnail': imgUrl,
2508 'description': description,
2509 'player_url': playerUrl,
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2515 class CollegeHumorIE(InfoExtractor):
2516 """Information extractor for collegehumor.com"""
2518 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2519 IE_NAME = u'collegehumor'
2521 def report_webpage(self, video_id):
2522 """Report information extraction."""
2523 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2525 def report_extraction(self, video_id):
2526 """Report information extraction."""
2527 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2529 def _real_extract(self, url):
2530 mobj = re.match(self._VALID_URL, url)
2532 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2534 video_id = mobj.group('videoid')
2536 self.report_webpage(video_id)
2537 request = urllib2.Request(url)
2539 webpage = urllib2.urlopen(request).read()
2540 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2541 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The page id differs from the URL id; the internal one keys the
# moogaloop metadata XML below.
2544 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2546 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2548 internal_video_id = m.group('internalvideoid')
2552 'internal_id': internal_video_id,
2554 'upload_date': u'NA',
2557 self.report_extraction(video_id)
2558 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2560 metaXml = urllib2.urlopen(xmlUrl).read()
2561 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2562 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Fill title/description/url/thumbnail from the metadata XML; the file
# extension is whatever follows the last '.' of the media URL.
2565 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2567 videoNode = mdoc.findall('./video')[0]
2568 info['description'] = videoNode.findall('./description')[0].text
2569 info['title'] = videoNode.findall('./caption')[0].text
2570 info['url'] = videoNode.findall('./file')[0].text
2571 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2572 info['ext'] = info['url'].rpartition('.')[2]
2574 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2580 class XVideosIE(InfoExtractor):
2581 """Information extractor for xvideos.com"""
2583 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2584 IE_NAME = u'xvideos'
2586 def report_webpage(self, video_id):
2587 """Report information extraction."""
2588 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2590 def report_extraction(self, video_id):
2591 """Report information extraction."""
2592 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2594 def _real_extract(self, url):
2595 mobj = re.match(self._VALID_URL, url)
2597 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2599 video_id = mobj.group(1).decode('utf-8')
2601 self.report_webpage(video_id)
2603 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2605 webpage = urllib2.urlopen(request).read()
2606 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2607 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2610 self.report_extraction(video_id)
# The flv URL is passed percent-encoded in the player query string.
2614 mobj = re.search(r'flv_url=(.+?)&', webpage)
2616 self._downloader.trouble(u'ERROR: unable to extract video url')
2618 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
# Title is the <title> text up to the " - XVID" suffix.
2622 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2624 self._downloader.trouble(u'ERROR: unable to extract video title')
2626 video_title = mobj.group(1).decode('utf-8')
2629 # Extract video thumbnail
2630 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2632 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail address.
2634 video_thumbnail = mobj.group(0).decode('utf-8')
2640 'upload_date': u'NA',
2641 'title': video_title,
2643 'thumbnail': video_thumbnail,
2644 'description': None,
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2650 class SoundcloudIE(InfoExtractor):
2651 """Information extractor for soundcloud.com
2652 To access the media, the uid of the song and a stream token
2653 must be extracted from the page source and the script must make
2654 a request to media.soundcloud.com/crossdomain.xml. Then
2655 the media can be grabbed by requesting from an url composed
2656 of the stream token and uid
2659 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2660 IE_NAME = u'soundcloud'
2662 def __init__(self, downloader=None):
2663 InfoExtractor.__init__(self, downloader)
2665 def report_webpage(self, video_id):
2666 """Report information extraction."""
2667 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2669 def report_extraction(self, video_id):
2670 """Report information extraction."""
2671 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2673 def _real_extract(self, url):
2674 mobj = re.match(self._VALID_URL, url)
2676 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2679 # extract uploader (which is in the url)
2680 uploader = mobj.group(1).decode('utf-8')
2681 # extract simple title (uploader + slug of song title)
2682 slug_title = mobj.group(2).decode('utf-8')
2683 simple_title = uploader + u'-' + slug_title
2685 self.report_webpage('%s/%s' % (uploader, slug_title))
2687 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2689 webpage = urllib2.urlopen(request).read()
2690 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2691 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2694 self.report_extraction('%s/%s' % (uploader, slug_title))
2696 # extract uid and stream token that soundcloud hands out for access
2697 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2699 video_id = mobj.group(1)
2700 stream_token = mobj.group(2)
2702 # extract unsimplified title
2703 mobj = re.search('"title":"(.*?)",', webpage)
2705 title = mobj.group(1).decode('utf-8')
2707 title = simple_title
2709 # construct media url (with uid/token)
2710 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2711 mediaURL = mediaURL % (video_id, stream_token)
2714 description = u'No description available'
2715 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2717 description = mobj.group(1)
# Page date like 'November 8, 2011 14:30' -> YYYYMMDD; parse failures are
# logged to stderr and the date is left as-is.
2721 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2724 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2725 except Exception, e:
2726 self._downloader.to_stderr(compat_str(e))
2728 # for soundcloud, a request to a cross domain is required for cookies
2729 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2732 'id': video_id.decode('utf-8'),
2734 'uploader': uploader.decode('utf-8'),
2735 'upload_date': upload_date,
2738 'description': description.decode('utf-8')
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2742 class InfoQIE(InfoExtractor):
2743 """Information extractor for infoq.com"""
2745 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2748 def report_webpage(self, video_id):
2749 """Report information extraction."""
2750 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2752 def report_extraction(self, video_id):
2753 """Report information extraction."""
2754 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2756 def _real_extract(self, url):
2757 mobj = re.match(self._VALID_URL, url)
2759 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2762 self.report_webpage(url)
2764 request = urllib2.Request(url)
2766 webpage = urllib2.urlopen(request).read()
2767 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2768 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2771 self.report_extraction(url)
# jsclassref holds the base64-encoded (and percent-encoded) rtmp path.
2775 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2777 self._downloader.trouble(u'ERROR: unable to extract video url')
2779 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2783 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2785 self._downloader.trouble(u'ERROR: unable to extract video title')
2787 video_title = mobj.group(1).decode('utf-8')
2789 # Extract description
2790 video_description = u'No description available.'
2791 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2792 if mobj is not None:
2793 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the media URL's basename.
2795 video_filename = video_url.split('/')[-1]
2796 video_id, extension = video_filename.split('.')
2802 'upload_date': u'NA',
2803 'title': video_title,
2804 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2806 'description': video_description,
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2811 class MixcloudIE(InfoExtractor):
2812 """Information extractor for www.mixcloud.com"""
2813 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2814 IE_NAME = u'mixcloud'
2816 def __init__(self, downloader=None):
2817 InfoExtractor.__init__(self, downloader)
2819 def report_download_json(self, file_id):
2820 """Report JSON download."""
2821 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2823 def report_extraction(self, file_id):
2824 """Report information extraction."""
2825 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2827 def get_urls(self, jsonData, fmt, bitrate='best'):
2828 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a bare [urls] list; the
# TypeError fallback handles the bitrate-less shape.
2831 bitrate_list = jsonData[fmt]
2832 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2833 bitrate = max(bitrate_list) # select highest
2835 url_list = jsonData[fmt][bitrate]
2836 except TypeError: # we have no bitrate info.
2837 url_list = jsonData[fmt]
2840 def check_urls(self, url_list):
2841 """Returns 1st active url from list"""
# Probes each candidate with a GET; first one that opens wins.
2842 for url in url_list:
2844 urllib2.urlopen(url)
2846 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2851 def _print_formats(self, formats):
2852 print('Available formats:')
2853 for fmt in formats.keys():
2854 for b in formats[fmt]:
2856 ext = formats[fmt][b][0]
2857 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2858 except TypeError: # we have no bitrate info
2859 ext = formats[fmt][0]
2860 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2863 def _real_extract(self, url):
2864 mobj = re.match(self._VALID_URL, url)
2866 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2868 # extract uploader & filename from url
2869 uploader = mobj.group(1).decode('utf-8')
2870 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2872 # construct API request
2873 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2874 # retrieve .json file with links to files
2875 request = urllib2.Request(file_url)
2877 self.report_download_json(file_url)
2878 jsonData = urllib2.urlopen(request).read()
2879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2880 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2884 json_data = json.loads(jsonData)
2885 player_url = json_data['player_swf_url']
2886 formats = dict(json_data['audio_formats'])
2888 req_format = self._downloader.params.get('format', None)
2891 if self._downloader.params.get('listformats', None):
2892 self._print_formats(formats)
# No --format: walk the formats and keep the first with a live URL.
2895 if req_format is None or req_format == 'best':
2896 for format_param in formats.keys():
2897 url_list = self.get_urls(formats, format_param)
2899 file_url = self.check_urls(url_list)
2900 if file_url is not None:
2903 if req_format not in formats.keys():
2904 self._downloader.trouble(u'ERROR: format is not available')
2907 url_list = self.get_urls(formats, req_format)
2908 file_url = self.check_urls(url_list)
2909 format_param = req_format
2912 'id': file_id.decode('utf-8'),
2913 'url': file_url.decode('utf-8'),
2914 'uploader': uploader.decode('utf-8'),
2915 'upload_date': u'NA',
2916 'title': json_data['name'],
2917 'ext': file_url.split('.')[-1].decode('utf-8'),
# py2 and-or idiom: u'NA' when format_param is falsy, else the format name.
2918 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2919 'thumbnail': json_data['thumbnail_url'],
2920 'description': json_data['description'],
2921 'player_url': player_url.decode('utf-8'),
# NOTE(review): garbled extract -- stray line-number prefixes, interior lines
# missing. Code kept byte-identical; comments only.
2924 class StanfordOpenClassroomIE(InfoExtractor):
2925 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: VideoPage (one video), CoursePage (a course's videos),
# or the site root / HomePage (all courses).
2927 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2928 IE_NAME = u'stanfordoc'
2930 def report_download_webpage(self, objid):
2931 """Report information extraction."""
2932 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2934 def report_extraction(self, video_id):
2935 """Report information extraction."""
2936 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2938 def _real_extract(self, url):
2939 mobj = re.match(self._VALID_URL, url)
2941 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a single video -- title/url come from the course's XML feed.
2944 if mobj.group('course') and mobj.group('video'): # A specific video
2945 course = mobj.group('course')
2946 video = mobj.group('video')
2948 'id': course + '_' + video,
2950 'upload_date': u'NA',
2953 self.report_extraction(info['id'])
2954 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2955 xmlUrl = baseUrl + video + '.xml'
2957 metaXml = urllib2.urlopen(xmlUrl).read()
2958 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2959 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2961 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2963 info['title'] = mdoc.findall('./title')[0].text
2964 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2966 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2968 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page -- collect its VideoPage links and recurse via
# self.extract on each reference.
2970 elif mobj.group('course'): # A course page
2971 course = mobj.group('course')
2976 'upload_date': u'NA',
2979 self.report_download_webpage(info['id'])
2981 coursepage = urllib2.urlopen(url).read()
2982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2983 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2986 m = re.search('<h1>([^<]+)</h1>', coursepage)
2988 info['title'] = unescapeHTML(m.group(1))
2990 info['title'] = info['id']
2992 m = re.search('<description>([^<]+)</description>', coursepage)
2994 info['description'] = unescapeHTML(m.group(1))
2996 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2999 'type': 'reference',
3000 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3004 for entry in info['list']:
3005 assert entry['type'] == 'reference'
3006 results += self.extract(entry['url'])
# Case 3: the whole site -- collect CoursePage links from the root page
# and recurse the same way.
3011 'id': 'Stanford OpenClassroom',
3014 'upload_date': u'NA',
3017 self.report_download_webpage(info['id'])
3018 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3020 rootpage = urllib2.urlopen(rootURL).read()
3021 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3022 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3025 info['title'] = info['id']
3027 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3030 'type': 'reference',
3031 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3036 for entry in info['list']:
3037 assert entry['type'] == 'reference'
3038 results += self.extract(entry['url'])
3041 class MTVIE(InfoExtractor):
3042 """Information extractor for MTV.com"""
# The scheme is optional and captured as 'proto' so _real_extract can
# prepend 'http://' when it is absent; 'videoid' is the numeric id.
3044 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
def report_webpage(self, video_id):
    """Log that the MTV page for *video_id* is being downloaded."""
    line = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)
def report_extraction(self, video_id):
    """Log that information for *video_id* is being extracted."""
    self._downloader.to_screen(
        u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3055 def _real_extract(self, url):
# NOTE(review): numbered listing with holes — `if mobj is None:`
# guards, `try:` headers, `return`s and parts of the final info dict
# are among the missing original lines.
3056 mobj = re.match(self._VALID_URL, url)
3058 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# _VALID_URL makes the scheme optional; normalise to http:// for urllib2.
3060 if not mobj.group('proto'):
3061 url = 'http://' + url
3062 video_id = mobj.group('videoid')
3063 self.report_webpage(video_id)
3065 request = urllib2.Request(url)
3067 webpage = urllib2.urlopen(request).read()
3068 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3069 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Song, performer, uri and content id are scraped from <meta> tags;
# each failed search falls into a (missing) `if mobj is None:` branch.
3072 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3074 self._downloader.trouble(u'ERROR: unable to extract song name')
3076 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3077 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3079 self._downloader.trouble(u'ERROR: unable to extract performer')
3081 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3082 video_title = performer + ' - ' + song_name
3084 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3086 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3088 mtvn_uri = mobj.group(1)
3090 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3092 self._downloader.trouble(u'ERROR: unable to extract content id')
3094 content_id = mobj.group(1)
# The mediaGen endpoint returns an XML playlist of renditions.
3096 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3097 self.report_extraction(video_id)
3098 request = urllib2.Request(videogen_url)
3100 metadataXml = urllib2.urlopen(request).read()
3101 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3102 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3105 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3106 renditions = mdoc.findall('.//rendition')
3108 # For now, always pick the highest quality.
3109 rendition = renditions[-1]
# Format label like "mp4-640x480_800"; ext comes from the MIME subtype.
3112 _,_,ext = rendition.attrib['type'].partition('/')
3113 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3114 video_url = rendition.find('./src').text
3116 self._downloader.trouble('Invalid rendition field.')
# Tail of the returned info dict (its opener and other keys are cut).
3122 'uploader': performer,
3123 'upload_date': u'NA',
3124 'title': video_title,
3132 class YoukuIE(InfoExtractor):
# Extractor for v.youku.com watch pages; the named group 'ID' is the
# alphanumeric token in .../v_show/id_<ID>.html.
3134 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in a downloader."""
    super(YoukuIE, self).__init__(downloader)
def report_download_webpage(self, file_id):
    """Log that the Youku playlist data for *file_id* is being fetched."""
    line = u'[Youku] %s: Downloading webpage' % file_id
    self._downloader.to_screen(line)
def report_extraction(self, file_id):
    """Log that information for *file_id* is being extracted."""
    self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3149 nowTime = int(time.time() * 1000)
3150 random1 = random.randint(1000,1998)
3151 random2 = random.randint(1000,9999)
3153 return "%d%d%d" %(nowTime,random1,random2)
3155 def _get_file_ID_mix_string(self, seed):
3157 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3159 for i in range(len(source)):
3160 seed = (seed * 211 + 30031 ) % 65536
3161 index = math.floor(seed / 65536 * len(source) )
3162 mixed.append(source[int(index)])
3163 source.remove(source[int(index)])
3164 #return ''.join(mixed)
def _get_file_id(self, fileId, seed):
    """Decode Youku's '*'-separated index string into the real file id.

    Each numeric token in *fileId* is an index into the seed-shuffled
    alphabet produced by _get_file_ID_mix_string().

    NOTE(review): the `realId = []` initialiser and the loop header were
    missing from this garbled listing; the loop below restores them,
    skipping the empty tokens that split('*') produces — confirm against
    upstream.
    """
    mixed = self._get_file_ID_mix_string(seed)
    ids = fileId.split('*')
    realId = []
    for ch in ids:
        if ch:
            realId.append(mixed[int(ch)])
    return ''.join(realId)
3176 def _real_extract(self, url):
# NOTE(review): numbered listing with holes — guards, `try:` headers,
# the format-fallback branches and the final return of the collected
# per-segment info dicts are among the missing original lines.
3177 mobj = re.match(self._VALID_URL, url)
3179 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3181 video_id = mobj.group('ID')
# getPlayList returns JSON describing title, seed, formats and segments.
3183 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3185 request = urllib2.Request(info_url, None, std_headers)
3187 self.report_download_webpage(video_id)
3188 jsondata = urllib2.urlopen(request).read()
3189 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3190 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3193 self.report_extraction(video_id)
3195 config = json.loads(jsondata)
3197 video_title = config['data'][0]['title']
3198 seed = config['data'][0]['seed']
# The requested --format is resolved against the advertised stream types.
3200 format = self._downloader.params.get('format', None)
3201 supported_format = config['data'][0]['streamfileids'].keys()
3203 if format is None or format == 'best':
3204 if 'hd2' in supported_format:
3209 elif format == 'worst':
3217 fileid = config['data'][0]['streamfileids'][format]
3218 seg_number = len(config['data'][0]['segs'][format])
# One key per segment; each key signs that segment's download URL.
3221 for i in xrange(seg_number):
3222 keys.append(config['data'][0]['segs'][format][i]['k'])
3225 #youku only could be viewed from mainland china
3227 self._downloader.trouble(u'ERROR: unable to extract info section')
3231 sid = self._gen_sid()
3232 fileid = self._get_file_id(fileid, seed)
3234 #column 8,9 of fileid represent the segment number
3235 #fileid[7:9] should be changed
# Build one info dict per segment, patching the hex segment index into
# the decoded file id and into the signed getFlvPath URL.
3236 for index, key in enumerate(keys):
3238 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3239 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3242 'id': '%s_part%02d' % (video_id, index),
3243 'url': download_url,
3245 'upload_date': u'NA',
3246 'title': video_title,
3249 files_info.append(info)
3254 class XNXXIE(InfoExtractor):
3255 """Information extractor for xnxx.com"""
# Group 1 of _VALID_URL is the numeric video id.
3257 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Patterns applied to the raw page text: the percent-encoded flv_url
# parameter, the <title> prefix, and the url_bigthumb parameter.
3259 VIDEO_URL_RE = r'flv_url=(.*?)&'
3260 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3261 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
def report_webpage(self, video_id):
    """Log that the page for *video_id* is being downloaded."""
    self._downloader.to_screen(
        u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
def report_extraction(self, video_id):
    """Log that information for *video_id* is being extracted."""
    line = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(line)
3271 def _real_extract(self, url):
# NOTE(review): numbered listing with holes — the `if ... is None:`
# guards, `try:` headers and the opener/closer of the returned info
# dict are among the missing original lines.
3272 mobj = re.match(self._VALID_URL, url)
3274 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 of _VALID_URL is the numeric video id.
3276 video_id = mobj.group(1).decode('utf-8')
3278 self.report_webpage(video_id)
3280 # Get webpage content
3282 webpage = urllib2.urlopen(url).read()
3283 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3284 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The media URL is percent-encoded inside the page's flv_url parameter.
3287 result = re.search(self.VIDEO_URL_RE, webpage)
3289 self._downloader.trouble(u'ERROR: unable to extract video url')
3291 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3293 result = re.search(self.VIDEO_TITLE_RE, webpage)
3295 self._downloader.trouble(u'ERROR: unable to extract video title')
3297 video_title = result.group(1).decode('utf-8')
3299 result = re.search(self.VIDEO_THUMB_RE, webpage)
3301 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3303 video_thumbnail = result.group(1).decode('utf-8')
# Tail of the single-entry info dict that is returned to the caller.
3309 'upload_date': u'NA',
3310 'title': video_title,
3312 'thumbnail': video_thumbnail,
3313 'description': None,
3317 class GooglePlusIE(InfoExtractor):
3318 """Information extractor for plus.google.com."""
# Group 1 is the numeric user id, group 2 the post id.
3320 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3321 IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Create the extractor, optionally wiring in a downloader."""
    super(GooglePlusIE, self).__init__(downloader)
def report_extract_entry(self, url):
    """Tell the user which Google+ entry is being downloaded."""
    decoded = url.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % decoded)
def report_date(self, upload_date):
    """Show the entry's upload date on the console."""
    message = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(message)
def report_uploader(self, uploader):
    """Show the entry's uploader on the console."""
    decoded = uploader.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Uploader: %s' % decoded)
def report_title(self, video_title):
    """Show the extracted title on the console."""
    self._downloader.to_screen(
        u'[plus.google] Title: %s' % video_title.decode('utf-8'))
def report_extract_vid_page(self, video_page):
    """Show which video page is being mined for links."""
    page = video_page.decode('utf-8')
    self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % page)
3346 def _real_extract(self, url):
# NOTE(review): numbered listing with holes, and the definition runs
# past the end of this chunk — the `if mobj is None:` guards, `try:`
# headers, the fallback assignments for date/uploader/title and the
# return wrapper around the final dict are not visible here.
3347 # Extract id from URL
3348 mobj = re.match(self._VALID_URL, url)
3350 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Whole match is the post URL; group 2 is the post (video) id.
3353 post_url = mobj.group(0)
3354 video_id = mobj.group(2)
3356 video_extension = 'flv'
3358 # Step 1, Retrieve post webpage to extract further information
3359 self.report_extract_entry(post_url)
3360 request = urllib2.Request(post_url)
3362 webpage = urllib2.urlopen(request).read()
3363 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3364 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3367 # Extract update date
3369 pattern = 'title="Timestamp">(.*?)</a>'
3370 mobj = re.search(pattern, webpage)
3372 upload_date = mobj.group(1)
3373 # Convert timestring to a format suitable for filename
3374 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3375 upload_date = upload_date.strftime('%Y%m%d')
3376 self.report_date(upload_date)
# Uploader display name from the rel="author" anchor.
3380 pattern = r'rel\="author".*?>(.*?)</a>'
3381 mobj = re.search(pattern, webpage)
3383 uploader = mobj.group(1)
3384 self.report_uploader(uploader)
3387 # Get the first line for title
3389 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3390 mobj = re.search(pattern, webpage)
3392 video_title = mobj.group(1)
3393 self.report_title(video_title)
3395 # Step 2, Stimulate clicking the image box to launch video
3396 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3397 mobj = re.search(pattern, webpage)
3399 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3401 video_page = mobj.group(1)
3402 request = urllib2.Request(video_page)
3404 webpage = urllib2.urlopen(request).read()
3405 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3406 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3408 self.report_extract_vid_page(video_page)
3411 # Extract video links on video page
3412 """Extract video links of all sizes"""
3413 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3414 mobj = re.findall(pattern, webpage)
3416 self._downloader.trouble(u'ERROR: unable to extract video links')
3418 # Sort in resolution
3419 links = sorted(mobj)
3421 # Choose the lowest of the sort, i.e. highest resolution
3422 video_url = links[-1]
3423 # Only get the url. The resolution part in the tuple has no use anymore
3424 video_url = video_url[-1]
3425 # Treat escaped \u0026 style hex
3426 video_url = unicode(video_url, "unicode_escape")
# Fields of the returned info dict (its wrapper and 'url' line are cut
# off past the end of this chunk).
3430 'id': video_id.decode('utf-8'),
3432 'uploader': uploader.decode('utf-8'),
3433 'upload_date': upload_date.decode('utf-8'),
3434 'title': video_title.decode('utf-8'),
3435 'ext': video_extension.decode('utf-8'),