2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one attribute-initialization line between the
        # docstring and this call is elided from this listing.
        self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # All user-facing output of this IE goes through this object.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): the property/def header for IE_NAME is elided from
        # this listing; the expression strips the trailing "IE" suffix from
        # the concrete class name.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): this listing elides lines — the `if note is None:`
        # guard, the `try:` opening the download, and the `if errnote is
        # None:` guard; the bare `except` below belongs to that elided `try:`.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as the project's ExtractorError, preserving the traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the charset out of e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the `if m:` test and the fallback-encoding branch are
        # elided from this listing.
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        # 'replace' substitutes undecodable bytes instead of raising.
        return webpage_bytes.decode(encoding, 'replace')
    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): the return statement is elided from this listing.

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): the remaining dict entries and the return are elided.

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): the rest of the dict literal and the None-guards
        # around these optional assignments are elided, as is the return.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''` assignment opening this
    # re.VERBOSE pattern is elided from this listing; the inline `#` text
    # below is part of the pattern string and must not be edited.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; entries other than '38' are elided here.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): the contents of this itag -> dimensions table are elided.
    _video_dimensions = {
222 def suitable(cls, url):
223 """Receives a URL and returns True if suitable for this IE."""
224 if YoutubePlaylistIE.suitable(url): return False
225 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
255 def report_video_subtitles_available(self, video_id, sub_lang_list):
256 """Report available subtitles."""
257 sub_lang = ",".join(list(sub_lang_list.keys()))
258 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _get_available_subtitles(self, video_id):
        # Queries Google's timedtext list endpoint.  On failure, returns an
        # (error message, None) tuple; the success-path return is elided
        # from this listing, as is the `try:` the bare except below closes.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Build a language-code -> track-name mapping from the track list XML.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
285 def _list_available_subtitles(self, video_id):
286 sub_lang_list = self._get_available_subtitles(video_id)
287 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download one subtitle track from the timedtext API.

        Return a tuple:
        (error_message, sub_lang, sub)
        """
        # NOTE(review): this listing elides the entries of the params dict,
        # the `try:` the bare except below closes, and an emptiness guard
        # before the 'Did not fetch' return.
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        # NOTE(review): several branch bodies and the final return are
        # elided from this listing.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick the language: explicit option first, then 'en', then the
        # first available track.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): this branch's body and the following `else:` are
            # elided; the next line belongs to the elided fallback branch.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        # Downloads every available subtitle track.  The accumulator
        # initialization (`subtitles`) and the final return are elided from
        # this listing.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """Print itag, container extension and dimensions for each format."""
        print('Available formats:')
        # NOTE(review): a loop over `formats` binding `x` is elided here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        # Sets the site language, then performs optional login and age
        # confirmation.  NOTE(review): this listing is heavily elided — the
        # bare `except` clauses below belong to elided `try:` blocks, and
        # several guards/assignments are missing.
        if self._downloader is None:
            # NOTE(review): the early return is elided.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): the branch consuming `info` is elided; this raise
            # belongs to its elided else-branch.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language (best effort: a failure only produces a warning).
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the guard that returns early without credentials is
        # elided here.

        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX form token out of the login page.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the dsh assignment and the opening of the
        # login_form_strs dict are elided; the entries below belong to it.
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # NOTE(review): the continuation of this comment is elided.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # Seeing the login form again means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age.  NOTE(review): the opening of the age_form dict is
        # elided.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the YouTube video id (match group 2) from a URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard and the final return are
        # elided from this listing.
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        # NOTE(review): this listing is heavily elided — bare `except`
        # clauses belong to elided `try:` blocks, and several guards,
        # assignments and `return`s are missing.

        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Unescape the backslash-escaped characters of the embedded config.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the `else:` for this branch appears elided.
            self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader name
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # Uploader id/nickname (best effort, from the page markup)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): the `if mobj is not None:`/`else:` pair around the
        # next two lines is elided.
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # Title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # Thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: normalize separators, then try several date formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # NOTE(review): the try/except around this strptime is elided.
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # NOTE(review): the `else:` preceding this fallback is elided.
            video_description = ''

        # Closed captions
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # NOTE(review): the `if video_subtitles:` and error guards around
            # these lines are elided.
            (sub_error, sub_lang, sub) = video_subtitles[0]
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        # NOTE(review): the `else:` preceding this assignment is elided.
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is read here but is not part of the filter
            # above — an entry without 'sig' would raise KeyError.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                # NOTE(review): the `else:` preceding this fallback is elided.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # NOTE(review): the `else:` opening this branch is elided.
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): the `if rf in url_map:` guard and a break
                    # are elided around this assignment.
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
            # NOTE(review): the `else:` opening this error branch is elided.
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # NOTE(review): the results-list initialization is elided.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the opening of this info dict (and its 'id'
            # entry) is elided, as is the closing brace and the return.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter endpoints used during initialization to opt out of the
    # age gate.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        # Fetch the disclaimer page and post the family-filter form so
        # age-gated videos are reachable.  NOTE(review): the `try:` lines and
        # the opening of the filter form dict are elided from this listing.
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # NOTE(review): this listing elides guard lines (`if mobj is None:`,
        # `try:`, `else:`) and `return`s throughout this method.

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): message reads "unable retrieve" — missing "to";
            # can't be fixed here without changing a runtime string.
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: dig the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.report_error(u'unable to extract media URL')

            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')

            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.report_error(u'unable to extract media URL')

            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.report_error(u'unable to extract title')

        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.report_error(u'unable to extract uploader nickname')

        video_uploader = mobj.group(1)

        # NOTE(review): the opening of the returned info-dict list is elided.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # NOTE(review): one or two class-level lines appear elided here.

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # NOTE(review): this listing elides guard lines (`if mobj is None:`,
        # the quality-selection `if`/`break`), some assignments (e.g. the one
        # binding `max_quality`) and `return`s.

        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Strip playlist/query suffixes from the captured id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated videos resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.report_error(u'unable to extract media URL')

        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.report_error(u'unable to extract title')

        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Reassemble DD-MM-YYYY into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the opening of the returned info-dict (with 'id',
        # 'url') is elided.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # NOTE(review): this listing elides guard lines, the `try:` the bare
        # except below closes, the assignment binding `video_url`, and the
        # `return`.

        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.report_error(u'unable to extract media URL')

        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.report_error(u'unable to extract title')

        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the opening of the returned info-dict list is elided.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        # NOTE(review): this listing elides guard lines and `try:` blocks,
        # and the listing ends before this method completes.

        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.report_error(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.report_error(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once on the canonical /watch/ URL; new_video=False
            # prevents a second rewrite pass.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.report_error(u'unable to extract video uploader')
        # NOTE(review): group(1) is the 'people|profile' alternative, not the
        # uploader name captured by group(2) — looks suspect; verify upstream.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'
1039 # Extract video height and width
1040 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1042 self._downloader.report_error(u'unable to extract video height')
1044 yv_video_height = mobj.group(1)
1046 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1048 self._downloader.report_error(u'unable to extract video width')
1050 yv_video_width = mobj.group(1)
1052 # Retrieve video playlist to extract media URL
1053 # I'm not completely sure what all these options are, but we
1054 # seem to need most of them, otherwise the server sends a 401.
1055 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1056 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1057 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1058 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1059 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1061 self.report_download_webpage(video_id)
1062 webpage = compat_urllib_request.urlopen(request).read()
1063 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1064 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1067 # Extract media URL from playlist XML
1068 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1070 self._downloader.report_error(u'Unable to extract media URL')
1072 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1073 video_url = unescapeHTML(video_url)
1076 'id': video_id.decode('utf-8'),
1078 'uploader': video_uploader,
1079 'upload_date': None,
1080 'title': video_title,
1081 'ext': video_extension.decode('utf-8'),
1082 'thumbnail': video_thumbnail.decode('utf-8'),
1083 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the embedded player config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available file in quality-preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download and return the raw page at url, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search with regex, and map the listed groups to keys.

        matchTuples is a list of (group_index, key, error_message); the
        corresponding error is reported if a group did not match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the rtmp live-stream URL for a live page."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): the assembled URL is built but never returned or
        # stored — live-stream extraction appears unfinished upstream.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 redirection chain and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN:/ytsearchall: prefix and trigger the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # limit is refined from the API's totalItems as pages arrive
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:/gvsearchall: prefix and trigger the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:/yvsearchall: prefix and trigger the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |    p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50  # API page size
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class check
        # cannot be reused directly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video URLs of the playlist via the GData API."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so the final list can be ordered.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        url_results = [self.url_result(url) for url in videos]
        return [self.playlist_result(url_results, playlist_id)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the channel and return them as a playlist."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop once the page no longer advertises a next page.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect the user's uploads page by page and return a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Number of videos returned per AJAX page; used to detect the last page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and return them as a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The numeric users_id needed for the episode-list endpoint is
            # scraped from the user's HTML page.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # --playliststart is 1-based; convert to a 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles.com page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Fix: pattern must be a raw string — '\s' in a plain string
                # is an invalid escape sequence on modern Python.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc credentials.

        Login is best-effort: failures are reported as warnings and
        extraction proceeds unauthenticated."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available; continue without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being present in the response means login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail for one video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video parameters are embedded as a JSON array between these two
        # literal JavaScript fragments in the page source.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information, either via the site's JSON API or, for
        responses that are the media itself, as a direct download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player whose fragment carries the real
        # file id; resolve it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (JSON-friendly) content to iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' key.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the FLV URL and title for a myvideo.de video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: was `self._download.report_error` — `_download` does not
            # exist on this class; every other IE uses `self._downloader`.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media base URL is taken from the page's image_src link; the
        # stream itself is <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest to highest; the last entry is the default pick.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class matcher
        # cannot be used as-is.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve the episode URL, download the show's MRSS index and return
        one info-dict per episode part (episodes are split into acts)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut forms like :tds resolve to the newest full episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # "Newest episode" requests redirect to the concrete episode URL;
            # re-match against the redirected URL to get its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per episode part/act.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v

            # The RTMP stream path maps onto a plain HTTP mirror; rewrite it.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the player's JSON(-ish) configuration."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset from the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # The player URL carries the configuration URL as a query parameter.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract metadata from the moogaloop XML, then resolve the stream
        URL from the Adobe f4m manifest it points at."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements are namespaced with the Adobe f4m schema.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the track via the Soundcloud API and return its MP3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint maps the public page URL to track metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set via the Soundcloud API and return one info-dict per track.

        Consistency fix: errors are reported through report_error, like every
        other extractor in this file, instead of the deprecated trouble().
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # The resolve endpoint maps the public set URL to set metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMP URL (base64-encoded in the page) and the title."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL; the real media id is base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
# Extractor for www.mixcloud.com using its v1 JSON API. Marked non-working.
# NOTE(review): fragmentary listing — try/except/return lines are elided.
2973 class MixcloudIE(InfoExtractor):
2974 """Information extractor for www.mixcloud.com"""
2976 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2977 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2978 IE_NAME = u'mixcloud'
2980 def __init__(self, downloader=None):
2981 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers.
2983 def report_download_json(self, file_id):
2984 """Report JSON download."""
2985 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2987 def report_extraction(self, file_id):
2988 """Report information extraction."""
2989 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2991 def get_urls(self, jsonData, fmt, bitrate='best'):
2992 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] may be a dict keyed by bitrate, or a plain url list.
2995 bitrate_list = jsonData[fmt]
2996 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2997 bitrate = max(bitrate_list) # select highest
2999 url_list = jsonData[fmt][bitrate]
3000 except TypeError: # we have no bitrate info.
3001 url_list = jsonData[fmt]
3004 def check_urls(self, url_list):
3005 """Returns 1st active url from list"""
# Probe each candidate with a GET; the first that opens wins.
3006 for url in url_list:
3008 compat_urllib_request.urlopen(url)
3010 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# List all format/bitrate combinations for --list-formats.
3015 def _print_formats(self, formats):
3016 print('Available formats:')
3017 for fmt in formats.keys():
3018 for b in formats[fmt]:
3020 ext = formats[fmt][b][0]
3021 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3022 except TypeError: # we have no bitrate info
3023 ext = formats[fmt][0]
3024 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3027 def _real_extract(self, url):
3028 mobj = re.match(self._VALID_URL, url)
3030 self._downloader.report_error(u'invalid URL: %s' % url)
3032 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on re match groups implies Python-2 era str.
3033 uploader = mobj.group(1).decode('utf-8')
3034 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3036 # construct API request
3037 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3038 # retrieve .json file with links to files
3039 request = compat_urllib_request.Request(file_url)
3041 self.report_download_json(file_url)
3042 jsonData = compat_urllib_request.urlopen(request).read()
3043 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3044 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
# parse JSON payload
3048 json_data = json.loads(jsonData)
3049 player_url = json_data['player_swf_url']
3050 formats = dict(json_data['audio_formats'])
3052 req_format = self._downloader.params.get('format', None)
3055 if self._downloader.params.get('listformats', None):
3056 self._print_formats(formats)
# 'best' (or no preference): scan all formats for the first live URL.
3059 if req_format is None or req_format == 'best':
3060 for format_param in formats.keys():
3061 url_list = self.get_urls(formats, format_param)
3063 file_url = self.check_urls(url_list)
3064 if file_url is not None:
3067 if req_format not in formats:
3068 self._downloader.report_error(u'format is not available')
3071 url_list = self.get_urls(formats, req_format)
3072 file_url = self.check_urls(url_list)
3073 format_param = req_format
3076 'id': file_id.decode('utf-8'),
3077 'url': file_url.decode('utf-8'),
3078 'uploader': uploader.decode('utf-8'),
3079 'upload_date': None,
3080 'title': json_data['name'],
3081 'ext': file_url.split('.')[-1].decode('utf-8'),
3082 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3083 'thumbnail': json_data['thumbnail_url'],
3084 'description': json_data['description'],
3085 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom. Handles three URL shapes: a specific
# video, a course page (recurses into its videos), and the root page
# (recurses into all courses). NOTE(review): fragmentary listing; guard/return
# lines are elided from view.
3088 class StanfordOpenClassroomIE(InfoExtractor):
3089 """Information extractor for Stanford's Open ClassRoom"""
3091 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3092 IE_NAME = u'stanfordoc'
# Progress reporting helpers.
3094 def report_download_webpage(self, objid):
3095 """Report information extraction."""
3096 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3098 def report_extraction(self, video_id):
3099 """Report information extraction."""
3100 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3102 def _real_extract(self, url):
3103 mobj = re.match(self._VALID_URL, url)
3105 raise ExtractorError(u'Invalid URL: %s' % url)
3107 if mobj.group('course') and mobj.group('video'): # A specific video
3108 course = mobj.group('course')
3109 video = mobj.group('video')
3111 'id': course + '_' + video,
3113 'upload_date': None,
3116 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the video.
3117 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3118 xmlUrl = baseUrl + video + '.xml'
3120 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3121 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3122 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3124 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3126 info['title'] = mdoc.findall('./title')[0].text
3127 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3129 self._downloader.report_error(u'Invalid metadata XML file')
3131 info['ext'] = info['url'].rpartition('.')[2]
3133 elif mobj.group('course'): # A course page
3134 course = mobj.group('course')
3139 'upload_date': None,
3142 coursepage = self._download_webpage(url, info['id'],
3143 note='Downloading course info page',
3144 errnote='Unable to download course info page')
3146 m = re.search('<h1>([^<]+)</h1>', coursepage)
3148 info['title'] = unescapeHTML(m.group(1))
# Fall back to the course id when no <h1> title is found.
3150 info['title'] = info['id']
3152 m = re.search('<description>([^<]+)</description>', coursepage)
3154 info['description'] = unescapeHTML(m.group(1))
# Collect per-video links and recurse via self.extract on each reference.
3156 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3159 'type': 'reference',
3160 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3164 for entry in info['list']:
3165 assert entry['type'] == 'reference'
3166 results += self.extract(entry['url'])
# else-branch: the root page — enumerate every course and recurse.
3170 'id': 'Stanford OpenClassroom',
3173 'upload_date': None,
3176 self.report_download_webpage(info['id'])
3177 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3179 rootpage = compat_urllib_request.urlopen(rootURL).read()
3180 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3181 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3184 info['title'] = info['id']
3186 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3189 'type': 'reference',
3190 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3195 for entry in info['list']:
3196 assert entry['type'] == 'reference'
3197 results += self.extract(entry['url'])
# Extractor for MTV.com: scrapes <meta> tags for ids, then fetches a
# mediaGen XML document listing renditions and picks the last (highest
# quality) one. NOTE(review): fragmentary listing; guards/returns elided.
3200 class MTVIE(InfoExtractor):
3201 """Information extractor for MTV.com"""
3203 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3206 def report_extraction(self, video_id):
3207 """Report information extraction."""
3208 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3210 def _real_extract(self, url):
3211 mobj = re.match(self._VALID_URL, url)
3213 self._downloader.report_error(u'invalid URL: %s' % url)
# Scheme-less URLs are normalized so urlopen accepts them.
3215 if not mobj.group('proto'):
3216 url = 'http://' + url
3217 video_id = mobj.group('videoid')
3219 webpage = self._download_webpage(url, video_id)
# Song name, performer, playlist URI and content id come from meta tags /
# inline JS on the page.
3221 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3223 self._downloader.report_error(u'unable to extract song name')
3225 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3226 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3228 self._downloader.report_error(u'unable to extract performer')
3230 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3231 video_title = performer + ' - ' + song_name
3233 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3235 self._downloader.report_error(u'unable to mtvn_uri')
3237 mtvn_uri = mobj.group(1)
3239 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3241 self._downloader.report_error(u'unable to extract content id')
3243 content_id = mobj.group(1)
3245 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3246 self.report_extraction(video_id)
3247 request = compat_urllib_request.Request(videogen_url)
3249 metadataXml = compat_urllib_request.urlopen(request).read()
3250 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3251 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3254 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3255 renditions = mdoc.findall('.//rendition')
3257 # For now, always pick the highest quality.
3258 rendition = renditions[-1]
# Build a format label like "mp4-640x360_800" from the rendition attributes.
3261 _,_,ext = rendition.attrib['type'].partition('/')
3262 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3263 video_url = rendition.find('./src').text
3265 self._downloader.trouble('Invalid rendition field.')
3271 'uploader': performer,
3272 'upload_date': None,
3273 'title': video_title,
# Extractor for v.youku.com: fetches a playlist JSON, de-obfuscates the
# fileid using a seeded key-mix, then emits one info dict per segment.
# NOTE(review): fragmentary listing; def lines for _gen_sid and several
# branches are elided from view.
3281 class YoukuIE(InfoExtractor):
3282 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3284 def report_download_webpage(self, file_id):
3285 """Report webpage download."""
3286 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3288 def report_extraction(self, file_id):
3289 """Report information extraction."""
3290 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Body of _gen_sid (def line elided): session id is epoch-millis plus two
# bounded random numbers, concatenated as a string.
3293 nowTime = int(time.time() * 1000)
3294 random1 = random.randint(1000,1998)
3295 random2 = random.randint(1000,9999)
3297 return "%d%d%d" %(nowTime,random1,random2)
3299 def _get_file_ID_mix_string(self, seed):
# Deterministic Fisher-Yates-like shuffle of the alphabet driven by `seed`;
# reproduces the site's client-side key-mixing.
3301 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3303 for i in range(len(source)):
3304 seed = (seed * 211 + 30031 ) % 65536
3305 index = math.floor(seed / 65536 * len(source) )
3306 mixed.append(source[int(index)])
3307 source.remove(source[int(index)])
3308 #return ''.join(mixed)
3311 def _get_file_id(self, fileId, seed):
# Map '*'-separated numeric indices through the mixed alphabet.
3312 mixed = self._get_file_ID_mix_string(seed)
3313 ids = fileId.split('*')
3317 realId.append(mixed[int(ch)])
3318 return ''.join(realId)
3320 def _real_extract(self, url):
3321 mobj = re.match(self._VALID_URL, url)
3323 self._downloader.report_error(u'invalid URL: %s' % url)
3325 video_id = mobj.group('ID')
3327 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3329 request = compat_urllib_request.Request(info_url, None, std_headers)
3331 self.report_download_webpage(video_id)
3332 jsondata = compat_urllib_request.urlopen(request).read()
3333 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3334 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3337 self.report_extraction(video_id)
3339 jsonstr = jsondata.decode('utf-8')
3340 config = json.loads(jsonstr)
3342 video_title = config['data'][0]['title']
3343 seed = config['data'][0]['seed']
# Resolve the requested format against what the playlist offers.
3345 format = self._downloader.params.get('format', None)
3346 supported_format = list(config['data'][0]['streamfileids'].keys())
3348 if format is None or format == 'best':
3349 if 'hd2' in supported_format:
3354 elif format == 'worst':
3362 fileid = config['data'][0]['streamfileids'][format]
3363 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3364 except (UnicodeDecodeError, ValueError, KeyError):
3365 self._downloader.report_error(u'unable to extract info section')
3369 sid = self._gen_sid()
3370 fileid = self._get_file_id(fileid, seed)
3372 #column 8,9 of fileid represent the segment number
3373 #fileid[7:9] should be changed
# One download URL (and info dict) per segment key.
3374 for index, key in enumerate(keys):
3376 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3377 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3380 'id': '%s_part%02d' % (video_id, index),
3381 'url': download_url,
3383 'upload_date': None,
3384 'title': video_title,
3387 files_info.append(info)
# Extractor for video.xnxx.com: three regexes pull the flv URL, title and
# thumbnail straight out of the page source.
# NOTE(review): fragmentary listing; guard/return lines are elided.
3392 class XNXXIE(InfoExtractor):
3393 """Information extractor for xnxx.com"""
3395 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3397 VIDEO_URL_RE = r'flv_url=(.*?)&'
3398 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3399 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
# Progress reporting helpers.
3401 def report_webpage(self, video_id):
3402 """Report information extraction"""
3403 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3405 def report_extraction(self, video_id):
3406 """Report information extraction"""
3407 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3409 def _real_extract(self, url):
3410 mobj = re.match(self._VALID_URL, url)
3412 self._downloader.report_error(u'invalid URL: %s' % url)
3414 video_id = mobj.group(1)
3416 self.report_webpage(video_id)
3418 # Get webpage content
3420 webpage_bytes = compat_urllib_request.urlopen(url).read()
3421 webpage = webpage_bytes.decode('utf-8')
3422 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3423 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# The flv URL arrives percent-encoded in a flashvars-style parameter.
3426 result = re.search(self.VIDEO_URL_RE, webpage)
3428 self._downloader.report_error(u'unable to extract video url')
3430 video_url = compat_urllib_parse.unquote(result.group(1))
3432 result = re.search(self.VIDEO_TITLE_RE, webpage)
3434 self._downloader.report_error(u'unable to extract video title')
3436 video_title = result.group(1)
3438 result = re.search(self.VIDEO_THUMB_RE, webpage)
3440 self._downloader.report_error(u'unable to extract video thumbnail')
3442 video_thumbnail = result.group(1)
3448 'upload_date': None,
3449 'title': video_title,
3451 'thumbnail': video_thumbnail,
3452 'description': None,
# Extractor for plus.google.com posts: scrapes the post page for metadata,
# follows the photo/video page, then picks the highest-resolution link.
# NOTE(review): fragmentary listing; guard/return/dict-opener lines elided.
3456 class GooglePlusIE(InfoExtractor):
3457 """Information extractor for plus.google.com."""
3459 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3460 IE_NAME = u'plus.google'
3462 def __init__(self, downloader=None):
3463 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers (one per extraction stage).
3465 def report_extract_entry(self, url):
3466 """Report downloading extry"""
3467 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3469 def report_date(self, upload_date):
3470 """Report downloading extry"""
3471 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3473 def report_uploader(self, uploader):
3474 """Report downloading extry"""
3475 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3477 def report_title(self, video_title):
3478 """Report downloading extry"""
3479 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3481 def report_extract_vid_page(self, video_page):
3482 """Report information extraction."""
3483 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3485 def _real_extract(self, url):
3486 # Extract id from URL
3487 mobj = re.match(self._VALID_URL, url)
3489 self._downloader.report_error(u'Invalid URL: %s' % url)
3492 post_url = mobj.group(0)
3493 video_id = mobj.group(1)
3495 video_extension = 'flv'
3497 # Step 1, Retrieve post webpage to extract further information
3498 self.report_extract_entry(post_url)
3499 request = compat_urllib_request.Request(post_url)
3501 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3502 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3503 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3506 # Extract update date
3508 pattern = 'title="Timestamp">(.*?)</a>'
3509 mobj = re.search(pattern, webpage)
3511 upload_date = mobj.group(1)
3512 # Convert timestring to a format suitable for filename
3513 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3514 upload_date = upload_date.strftime('%Y%m%d')
3515 self.report_date(upload_date)
# Extract the uploader from the rel="author" anchor.
3519 pattern = r'rel\="author".*?>(.*?)</a>'
3520 mobj = re.search(pattern, webpage)
3522 uploader = mobj.group(1)
3523 self.report_uploader(uploader)
3526 # Get the first line for title
3528 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3529 mobj = re.search(pattern, webpage)
3531 video_title = mobj.group(1)
3532 self.report_title(video_title)
3534 # Step 2, Stimulate clicking the image box to launch video
3535 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3536 mobj = re.search(pattern, webpage)
3538 self._downloader.report_error(u'unable to extract video page URL')
3540 video_page = mobj.group(1)
3541 request = compat_urllib_request.Request(video_page)
3543 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3544 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3545 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3547 self.report_extract_vid_page(video_page)
3550 # Extract video links on video page
3551 """Extract video links of all sizes"""
3552 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3553 mobj = re.findall(pattern, webpage)
3555 self._downloader.report_error(u'unable to extract video links')
3557 # Sort in resolution
3558 links = sorted(mobj)
3560 # Choose the lowest of the sort, i.e. highest resolution
3561 video_url = links[-1]
3562 # Only get the url. The resolution part in the tuple has no use anymore
3563 video_url = video_url[-1]
3564 # Treat escaped \u0026 style hex
# Py2/Py3 split: str.decode exists on Python 2 only; AttributeError on 3.
3566 video_url = video_url.decode("unicode_escape")
3567 except AttributeError: # Python 3
3568 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3574 'uploader': uploader,
3575 'upload_date': upload_date,
3576 'title': video_title,
3577 'ext': video_extension,
# Extractor for nba.com videos: the MP4 URL is derived directly from the URL
# path; metadata is scraped from og: meta tags with a small local helper.
# NOTE(review): fragmentary listing; guard/return lines elided.
3580 class NBAIE(InfoExtractor):
3581 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3584 def _real_extract(self, url):
3585 mobj = re.match(self._VALID_URL, url)
3587 self._downloader.report_error(u'invalid URL: %s' % url)
3590 video_id = mobj.group(1)
3591 if video_id.endswith('/index.html'):
3592 video_id = video_id[:-len('/index.html')]
3594 webpage = self._download_webpage(url, video_id)
# CDN URL is fully determined by the path component — no page parsing needed.
3596 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, HTML-unescaped, else `default`.
3597 def _findProp(rexp, default=None):
3598 m = re.search(rexp, webpage)
3600 return unescapeHTML(m.group(1))
3604 shortened_video_id = video_id.rpartition('/')[2]
3605 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3607 'id': shortened_video_id,
# NOTE(review): key is 'uploader_date' here, not 'upload_date' — looks like
# a typo in the original; kept verbatim.
3611 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3612 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv: pages through the JSON API in chunks
# of _JUSTIN_PAGE_LIMIT until a short page signals the end.
# NOTE(review): fragmentary listing; try/guard/loop-setup lines elided.
3616 class JustinTVIE(InfoExtractor):
3617 """Information extractor for justin.tv and twitch.tv"""
3618 # TODO: One broadcast may be split into multiple videos. The key
3619 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3620 # starts at 1 and increases. Can we treat all parts as one video?
3622 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3623 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3624 _JUSTIN_PAGE_LIMIT = 100
3625 IE_NAME = u'justin.tv'
3627 def report_extraction(self, file_id):
3628 """Report information extraction."""
3629 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3631 def report_download_page(self, channel, offset):
3632 """Report attempt to download a single page of videos."""
3633 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3634 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3636 # Return count of items, list of *valid* items
3637 def _parse_page(self, url):
3639 urlh = compat_urllib_request.urlopen(url)
3640 webpage_bytes = urlh.read()
3641 webpage = webpage_bytes.decode('utf-8', 'ignore')
3642 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3643 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# The API returns a JSON list on success; anything else is an error object.
3646 response = json.loads(webpage)
3647 if type(response) != list:
3648 error_text = response.get('error', 'unknown error')
3649 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3652 for clip in response:
3653 video_url = clip['video_file_url']
3655 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' → 'YYYYMMDD' by stripping dashes.
3656 video_date = re.sub('-', '', clip['start_time'][:10])
3657 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3658 video_id = clip['id']
3659 video_title = clip.get('title', video_id)
3663 'title': video_title,
3664 'uploader': clip.get('channel_name', video_uploader_id),
3665 'uploader_id': video_uploader_id,
3666 'upload_date': video_date,
3667 'ext': video_extension,
3669 return (len(response), info)
3671 def _real_extract(self, url):
3672 mobj = re.match(self._VALID_URL, url)
3674 self._downloader.report_error(u'invalid URL: %s' % url)
# Channel archives (group 1 only) vs a single broadcast (/b/<id>).
3677 api = 'http://api.justin.tv'
3678 video_id = mobj.group(mobj.lastindex)
3680 if mobj.lastindex == 1:
3682 api += '/channel/archives/%s.json'
3684 api += '/broadcast/by_archive/%s.json'
3685 api = api % (video_id,)
3687 self.report_extraction(video_id)
3691 limit = self._JUSTIN_PAGE_LIMIT
# Page through results; a page shorter than `limit` means we are done.
3694 self.report_download_page(video_id, offset)
3695 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3696 page_count, page_info = self._parse_page(page_url)
3697 info.extend(page_info)
3698 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the <source> tag, title from
# the player h1, optional description from the og:description meta tag.
# NOTE(review): fragmentary listing; guard/return lines elided.
3703 class FunnyOrDieIE(InfoExtractor):
3704 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3706 def _real_extract(self, url):
3707 mobj = re.match(self._VALID_URL, url)
3709 self._downloader.report_error(u'invalid URL: %s' % url)
3712 video_id = mobj.group('id')
3713 webpage = self._download_webpage(url, video_id)
3715 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3717 self._downloader.report_error(u'unable to find video information')
3718 video_url = unescapeHTML(m.group('url'))
3720 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3722 self._downloader.trouble(u'Cannot find video title')
3723 title = clean_html(m.group('title'))
# Description is optional; only set when the meta tag is present.
3725 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3727 desc = unescapeHTML(m.group('desc'))
3736 'description': desc,
# Extractor for store.steampowered.com: walks the game's /video/ page and
# zips together movie entries, titles and thumbnails into a result list.
# NOTE(review): fragmentary listing; the _VALID_URL group 'gameID' and the
# result-list assembly lines are elided from view.
3740 class SteamIE(InfoExtractor):
3741 _VALID_URL = r"""http://store.steampowered.com/
3742 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3744 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable(): _VALID_URL is written with (?x)-style
# comments, so it must be matched with re.VERBOSE.
3748 def suitable(cls, url):
3749 """Receives a URL and returns True if suitable for this IE."""
3750 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3752 def _real_extract(self, url):
3753 m = re.match(self._VALID_URL, url, re.VERBOSE)
3754 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3755 gameID = m.group('gameID')
3756 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3757 webpage = self._download_webpage(videourl, gameID)
3758 mweb = re.finditer(urlRE, webpage)
3759 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3760 titles = re.finditer(namesRE, webpage)
3761 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3762 thumbs = re.finditer(thumbsRE, webpage)
# Movies, titles and thumbnails are assumed to appear in the same order.
3764 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3765 video_id = vid.group('videoID')
3766 title = vtitle.group('videoName')
3767 video_url = vid.group('videoURL')
3768 video_thumb = thumb.group('thumbnail')
3770 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3775 'title': unescapeHTML(title),
3776 'thumbnail': video_thumb
# Extractor for www.ustream.tv recorded videos: the FLV URL is derived from
# the video id; title and uploader are scraped from data-* attributes.
# NOTE(review): fragmentary listing; result-dict opener/return elided.
3781 class UstreamIE(InfoExtractor):
3782 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3783 IE_NAME = u'ustream'
3785 def _real_extract(self, url):
3786 m = re.match(self._VALID_URL, url)
3787 video_id = m.group('videoID')
# CDN URL is a direct function of the numeric id.
3788 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3789 webpage = self._download_webpage(url, video_id)
3790 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3791 title = m.group('title')
3792 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3793 uploader = m.group('uploader')
3799 'uploader': uploader
# Extractor for worldstarhiphop.com (and the "candy" mirror): finds an
# hw-videos mp4/flv URL in the page, with title/thumbnail fallbacks.
# NOTE(review): fragmentary listing; ext assignment, else-branches and the
# final result dict are partly elided.
3803 class WorldStarHipHopIE(InfoExtractor):
3804 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3805 IE_NAME = u'WorldStarHipHop'
3807 def _real_extract(self, url):
3808 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3810 webpage_src = compat_urllib_request.urlopen(url).read()
3811 webpage_src = webpage_src.decode('utf-8')
3813 mobj = re.search(_src_url, webpage_src)
3815 m = re.match(self._VALID_URL, url)
3816 video_id = m.group('id')
3818 if mobj is not None:
3819 video_url = mobj.group()
# Extension is chosen by substring test on the matched URL.
3820 if 'mp4' in video_url:
3825 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3828 _title = r"""<title>(.*)</title>"""
3830 mobj = re.search(_title, webpage_src)
3832 if mobj is not None:
3833 title = mobj.group(1)
# Fallback title when the page has no <title> tag.
3835 title = 'World Start Hip Hop - %s' % time.ctime()
3837 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3838 mobj = re.search(_thumbnail, webpage_src)
3840 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3841 if mobj is not None:
3842 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead.
3844 _title = r"""candytitles.*>(.*)</span>"""
3845 mobj = re.search(_title, webpage_src)
3846 if mobj is not None:
3847 title = mobj.group(1)
3854 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: metadata is an inline gon.show JSON
# blob; the stream URL is the 'akamai_url' plus a fixed 256k bitrate arg.
# NOTE(review): fragmentary listing; result-dict opener/return elided.
3859 class RBMARadioIE(InfoExtractor):
3860 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3862 def _real_extract(self, url):
3863 m = re.match(self._VALID_URL, url)
3864 video_id = m.group('videoID')
3866 webpage = self._download_webpage(url, video_id)
3867 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3869 raise ExtractorError(u'Cannot find metadata')
3870 json_data = m.group(1)
3873 data = json.loads(json_data)
3874 except ValueError as e:
3875 raise ExtractorError(u'Invalid JSON: ' + str(e))
3877 video_url = data['akamai_url'] + '&cbr=256'
# Extension taken from the URL path, ignoring the query string.
3878 url_parts = compat_urllib_parse_urlparse(video_url)
3879 video_ext = url_parts.path.rpartition('.')[2]
3884 'title': data['title'],
3885 'description': data.get('teaser_text'),
3886 'location': data.get('country_of_origin'),
3887 'uploader': data.get('host', {}).get('name'),
3888 'uploader_id': data.get('host', {}).get('slug'),
3889 'thumbnail': data.get('image', {}).get('large_url_2x'),
3890 'duration': data.get('duration'),
# Extractor for youporn.com: sets an age-verification cookie, scrapes the
# download list, and builds one format entry per link (sorted elsewhere).
# NOTE(review): fragmentary listing; loop headers, sort call and several
# returns are elided from view.
3895 class YouPornIE(InfoExtractor):
3896 """Information extractor for youporn.com."""
3897 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Print an ext/format table for --list-formats.
3899 def _print_formats(self, formats):
3900 """Print all available formats"""
3901 print(u'Available formats:')
3902 print(u'ext\t\tformat')
3903 print(u'---------------------------------')
3904 for format in formats:
3905 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the entry whose 'format' equals req_format (loop header elided).
3907 def _specific(self, req_format, formats):
3909 if(x["format"]==req_format):
3913 def _real_extract(self, url):
3914 mobj = re.match(self._VALID_URL, url)
3916 self._downloader.report_error(u'invalid URL: %s' % url)
3919 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie before downloading the page.
3921 req = compat_urllib_request.Request(url)
3922 req.add_header('Cookie', 'age_verified=1')
3923 webpage = self._download_webpage(req, video_id)
3925 # Get the video title
3926 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3928 raise ExtractorError(u'Unable to extract video title')
3929 video_title = result.group('title').strip()
3931 # Get the video date
3932 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3934 self._downloader.report_warning(u'unable to extract video date')
3937 upload_date = result.group('date').strip()
3939 # Get the video uploader
3940 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3942 self._downloader.report_warning(u'unable to extract uploader')
3943 video_uploader = None
3945 video_uploader = result.group('uploader').strip()
3946 video_uploader = clean_html( video_uploader )
3948 # Get all of the formats available
3949 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3950 result = re.search(DOWNLOAD_LIST_RE, webpage)
3952 raise ExtractorError(u'Unable to extract download list')
3953 download_list_html = result.group('download_list').strip()
3955 # Get all of the links from the page
3956 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3957 links = re.findall(LINK_RE, download_list_html)
3958 if(len(links) == 0):
3959 raise ExtractorError(u'ERROR: no known formats available for video')
3961 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3966 # A link looks like this:
3967 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3968 # A path looks like this:
3969 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3970 video_url = unescapeHTML( link )
3971 path = compat_urllib_parse_urlparse( video_url ).path
3972 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size and bitrate.
3973 format = path.split('/')[4].split('_')[:2]
3976 format = "-".join( format )
3977 title = u'%s-%s-%s' % (video_title, size, bitrate)
3982 'uploader': video_uploader,
3983 'upload_date': upload_date,
3988 'description': None,
3992 if self._downloader.params.get('listformats', None):
3993 self._print_formats(formats)
# Format selection: best/worst/all/specific (formats assumed quality-sorted).
3996 req_format = self._downloader.params.get('format', None)
3997 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3999 if req_format is None or req_format == 'best':
4001 elif req_format == 'worst':
4002 return [formats[-1]]
4003 elif req_format in ('-1', 'all'):
4006 format = self._specific( req_format, formats )
4008 self._downloader.report_error(u'requested format not available')
# Extractor for pornotube.com: flv URL and upload date are scraped from the
# page; the title comes from the URL itself.
# NOTE(review): fragmentary listing; guard/return and info-dict closing
# lines are elided.
4014 class PornotubeIE(InfoExtractor):
4015 """Information extractor for pornotube.com."""
4016 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
4018 def _real_extract(self, url):
4019 mobj = re.match(self._VALID_URL, url)
4021 self._downloader.report_error(u'invalid URL: %s' % url)
4024 video_id = mobj.group('videoid')
# Title is taken directly from the URL's trailing segment.
4025 video_title = mobj.group('title')
4027 # Get webpage content
4028 webpage = self._download_webpage(url, video_id)
4031 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4032 result = re.search(VIDEO_URL_RE, webpage)
4034 self._downloader.report_error(u'unable to extract video url')
4036 video_url = compat_urllib_parse.unquote(result.group('url'))
4038 #Get the uploaded date
4039 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4040 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error text says 'title' but this branch is the date lookup —
# looks like a copy-paste slip in the original message; kept verbatim.
4042 self._downloader.report_error(u'unable to extract video title')
4044 upload_date = result.group('date')
4046 info = {'id': video_id,
4049 'upload_date': upload_date,
4050 'title': video_title,
# Extractor for youjizz.com: resolves the watch page to its embed page, then
# pulls the real media URL out of the embed page's flash `addVariable` call.
# NOTE(review): elided listing — the `if result is None:` guards preceding each
# `raise` line are not shown here.
4056 class YouJizzIE(InfoExtractor):
4057 """Information extractor for youjizz.com."""
4058 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4060 def _real_extract(self, url):
4061 mobj = re.match(self._VALID_URL, url)
4063 self._downloader.report_error(u'invalid URL: %s' % url)
4066 video_id = mobj.group('videoid')
4068 # Get webpage content
4069 webpage = self._download_webpage(url, video_id)
4071 # Get the video title
4072 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4074 raise ExtractorError(u'ERROR: unable to extract video title')
4075 video_title = result.group('title').strip()
4077 # Get the embed page
4078 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4080 raise ExtractorError(u'ERROR: unable to extract embed page')
4082 embed_page_url = result.group(0).strip()
# The numeric embed id replaces the slug-based id from the watch URL.
4083 video_id = result.group('videoid')
# Second fetch: the embed page is where the actual stream URL lives.
4085 webpage = self._download_webpage(embed_page_url, video_id)
4088 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4090 raise ExtractorError(u'ERROR: unable to extract video url')
4091 video_url = result.group('source')
# Result dictionary (remaining keys between these lines are elided);
# player_url is kept for rtmpdump-style downloads per the InfoExtractor contract.
4093 info = {'id': video_id,
4095 'title': video_title,
4098 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON blob from the playlist
# page, then walks the sets API track-by-track until `at_last_track` is true.
# NOTE(review): elided listing — `mix_id = ...`, the result-list initialisation,
# the per-track dict opening, the append, and the loop `break` are not shown.
4102 class EightTracksIE(InfoExtractor):
4104 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4106 def _real_extract(self, url):
4107 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guarding `if` line elided above).
4109 raise ExtractorError(u'Invalid URL: %s' % url)
4110 playlist_id = mobj.group('id')
4112 webpage = self._download_webpage(url, playlist_id)
# The page inlines the whole mix as a JS assignment; DOTALL because the JSON spans lines.
4114 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4116 raise ExtractorError(u'Cannot find trax information')
4117 json_like = m.group(1)
4118 data = json.loads(json_like)
# Random session token required by the 8tracks play API (not security-sensitive).
4120 session = str(random.randint(0, 1000000000))
4122 track_count = data['tracks_count']
# NOTE(review): `mix_id` is used here but its assignment is elided from this listing;
# presumably taken from `data` — confirm against the full source.
4123 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4124 next_url = first_url
# Unbounded counter; the loop exits via the elided `break` after the last track.
4126 for i in itertools.count():
4127 api_json = self._download_webpage(next_url, playlist_id,
4128 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4129 errnote=u'Failed to download song information')
4130 api_data = json.loads(api_json)
4131 track_data = api_data[u'set']['track']
# Per-track info-dict entries (dict opening/closing lines elided).
4133 'id': track_data['id'],
4134 'url': track_data['track_file_stream_url'],
4135 'title': track_data['performer'] + u' - ' + track_data['name'],
4136 'raw_title': track_data['name'],
4137 'uploader_id': data['user']['login'],
4141 if api_data['set']['at_last_track']:
# Each follow-up request must echo the previous track id back to the API.
4143 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com short clips. The media and thumbnail URLs are derived
# directly from the video id against the CDN; the page is only fetched for metadata.
# NOTE(review): elided listing — the info-dict opening and its id/url/ext/title
# keys (original lines between 4159 and 4165) are not shown.
4146 class KeekIE(InfoExtractor):
4147 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4150 def _real_extract(self, url):
# No explicit invalid-URL guard visible; presumably `suitable()` filtered already.
4151 m = re.match(self._VALID_URL, url)
4152 video_id = m.group('videoID')
# CDN URLs are fully deterministic from the id — no scraping needed for the media itself.
4153 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4154 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4155 webpage = self._download_webpage(url, video_id)
# Title from OpenGraph metadata; no None-check — a layout change would raise AttributeError.
4156 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4157 title = unescapeHTML(m.group('title'))
# Uploader name scraped from the profile block; [\S\s] spans newlines without DOTALL.
4158 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4159 uploader = clean_html(m.group('uploader'))
4165 'thumbnail': thumbnail,
4166 'uploader': uploader
# Extractor for ted.com: handles both single talks and playlists. Overrides
# suitable() because _VALID_URL is a verbose (re.VERBOSE) multi-line pattern.
# NOTE(review): elided listing — the _VALID_URL triple-quoted regex body, the
# `else:` branch in _real_extract, `video_RE=r'''` / `info=[]` / `return info`
# in _playlist_videos_info, and most of the _talk_info result dict are not shown.
4170 class TEDIE(InfoExtractor):
4171 _VALID_URL=r'''http://www.ted.com/
4173 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4175 ((?P<type_talk>talks)) # We have a simple talk
4177 /(?P<name>\w+) # Here goes the name and then ".html"
4181 def suitable(cls, url):
4182 """Receives a URL and returns True if suitable for this IE."""
4183 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Dispatch: single talk -> one-element list; playlist -> list of talk infos.
4185 def _real_extract(self, url):
4186 m=re.match(self._VALID_URL, url, re.VERBOSE)
4187 if m.group('type_talk'):
4188 return [self._talk_info(url)]
# Playlist branch (the `else:` line is elided above).
4190 playlist_id=m.group('playlist_id')
4191 name=m.group('name')
4192 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4193 return self._playlist_videos_info(url,name,playlist_id)
4195 def _talk_video_link(self,mediaSlug):
'''Returns the direct mp4 download URL for the given media slug.'''
4196 '''Returns the video link for that mediaSlug'''
4197 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4199 def _playlist_videos_info(self,url,name,playlist_id=0):
4200 '''Returns the videos of the playlist'''
4202 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4203 ([.\s]*?)data-playlist_item_id="(\d+)"
4204 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4206 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4207 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4208 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4209 m_names=re.finditer(video_name_RE,webpage)
# Pair each <li> talk entry with its title link; assumes both regexes match
# the same talks in the same order — TODO confirm against real playlist markup.
4211 for m_video, m_name in zip(m_videos,m_names):
4212 video_id=m_video.group('video_id')
4213 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4214 info.append(self._talk_info(talk_url,video_id))
4217 def _talk_info(self, url, video_id=0):
4218 """Return the video for the talk in the url"""
4219 m=re.match(self._VALID_URL, url,re.VERBOSE)
4220 videoName=m.group('name')
4221 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4222 # If the url includes the language we get the title translated
4223 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
# No None-check: a missing altHeadline span would raise AttributeError here.
4224 title=re.search(title_RE, webpage).group('title')
4225 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4226 "id":(?P<videoID>[\d]+).*?
4227 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4228 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4229 thumb_match=re.search(thumb_RE,webpage)
4230 info_match=re.search(info_RE,webpage,re.VERBOSE)
# Canonical numeric id from the talkDetails JS blob supersedes the passed-in one.
4231 video_id=info_match.group('videoID')
4232 mediaSlug=info_match.group('mediaSlug')
4233 video_url=self._talk_video_link(mediaSlug)
# Result dict (its opening and other keys are elided from this listing).
4239 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: derives the video id from the URL path, then reads
# all metadata (flv URL, title, format, description, thumbnail) from the site's
# XML metadata endpoint instead of scraping HTML.
# NOTE(review): elided listing — the empty-id fallback condition, the error
# branch for a missing format_id, the `else:` defaults for description/thumbnail,
# and the info-dict opening are not shown.
4243 class MySpassIE(InfoExtractor):
# Deliberately broad: every myspass.de URL is accepted; the id logic below sorts it out.
4244 _VALID_URL = r'http://www.myspass.de/.*'
4246 def _real_extract(self, url):
4247 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4249 # video id is the last path element of the URL
4250 # usually there is a trailing slash, so also try the second but last
4251 url_path = compat_urllib_parse_urlparse(url).path
4252 url_parent_path, video_id = os.path.split(url_path)
# Fallback when the last element was empty because of a trailing slash
# (the guarding `if` line is elided above).
4254 _, video_id = os.path.split(url_parent_path)
4257 metadata_url = META_DATA_URL_TEMPLATE % video_id
4258 metadata_text = self._download_webpage(metadata_url, video_id)
# encode before parsing: ElementTree.fromstring on Py2 wants bytes for non-ASCII content.
4259 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4261 # extract values from metadata
4262 url_flv_el = metadata.find('url_flv')
4263 if url_flv_el is None:
4264 self._downloader.report_error(u'unable to extract download url')
4266 video_url = url_flv_el.text
# File extension taken from the flv URL's path suffix (dot stripped).
4267 extension = os.path.splitext(video_url)[1][1:]
4268 title_el = metadata.find('title')
4269 if title_el is None:
4270 self._downloader.report_error(u'unable to extract title')
4272 title = title_el.text
4273 format_id_el = metadata.find('format_id')
4274 if format_id_el is None:
4277 format = format_id_el.text
# description/thumbnail are optional; their `else: ... = None` defaults are elided.
4278 description_el = metadata.find('description')
4279 if description_el is not None:
4280 description = description_el.text
4283 imagePreview_el = metadata.find('imagePreview')
4284 if imagePreview_el is not None:
4285 thumbnail = imagePreview_el.text
4294 'thumbnail': thumbnail,
4295 'description': description
# Extractor for spiegel.de videos: scrapes the title from the page, then reads
# stream variants from a per-video XML manifest and picks the last entry,
# which this code treats as the preferred format — TODO confirm ordering.
# NOTE(review): elided listing — the `if not m:` guard and the info-dict
# opening/closing lines are not shown.
4299 class SpiegelIE(InfoExtractor):
4300 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4302 def _real_extract(self, url):
4303 m = re.match(self._VALID_URL, url)
4304 video_id = m.group('videoID')
4306 webpage = self._download_webpage(url, video_id)
4307 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4309 raise ExtractorError(u'Cannot find title')
4310 video_title = unescapeHTML(m.group(1))
# Per-video manifest lives under a fixed flash/ path keyed by the numeric id.
4312 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4313 xml_code = self._download_webpage(xml_url, video_id,
4314 note=u'Downloading XML', errnote=u'Failed to download XML')
4316 idoc = xml.etree.ElementTree.fromstring(xml_code)
# Last <type> element in the manifest is used; assumes at least one exists
# and would IndexError otherwise.
4317 last_type = idoc[-1]
4318 filename = last_type.findall('./filename')[0].text
4319 duration = float(last_type.findall('./duration')[0].text)
4321 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension = text after the final dot of the manifest filename.
4322 video_ext = filename.rpartition('.')[2]
4327 'title': video_title,
4328 'duration': duration,
# Extractor for liveleak.com view pages: pulls the stream URL from the player
# config, title/description from OpenGraph meta tags, uploader from a "By:" link.
# NOTE(review): elided listing — the `if mobj is None:` / `if not m:` guards,
# the optional-field `else` defaults, and the info-dict opening are not shown.
# Also note the mixed error API here: both the deprecated trouble() and the
# newer report_error() are used — inconsistent in the original.
4332 class LiveLeakIE(InfoExtractor):
# `http?://` in the original permits "htt" + optional "p" — likely meant https?://.
4334 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4335 IE_NAME = u'liveleak'
4337 def _real_extract(self, url):
4338 mobj = re.match(self._VALID_URL, url)
4340 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4343 video_id = mobj.group('video_id')
4345 webpage = self._download_webpage(url, video_id)
# Direct media URL from the flash player's `file: "..."` config entry.
4347 m = re.search(r'file: "(.*?)",', webpage)
4349 self._downloader.report_error(u'unable to find video url')
4351 video_url = m.group(1)
4353 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4355 self._downloader.trouble(u'Cannot find video title')
# Strip the site-name prefix LiveLeak embeds in its og:title.
4356 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
# Description and uploader are optional (their None-defaulting branches are elided).
4358 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4360 desc = unescapeHTML(m.group('desc'))
4364 m = re.search(r'By:.*?(\w+)</a>', webpage)
4366 uploader = clean_html(m.group(1))
4375 'description': desc,
4376 'uploader': uploader
4382 def gen_extractors():
4383 """ Return a list of an instance of every supported extractor.
4384 The order does matter; the first extractor matched is the one handling the URL.
4387 YoutubePlaylistIE(),
4412 StanfordOpenClassroomIE(),
4422 WorldStarHipHopIE(),