2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this is an excerpted listing — the leading integer on each
# line is the ORIGINAL file line number, and gaps in that numbering mark
# elided source lines. Code is kept verbatim; only comments were added.
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
# Constructor: only stores the (optional) downloader; no I/O happens here.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
# NOTE(review): original line 80 (presumably a @classmethod decorator) is
# elided here — `cls` as first parameter suggests it; confirm in full source.
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
# Template method: delegates to the subclass hook (initialization call at
# original line 98 is elided from this view).
99 return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
# Derives the IE name by stripping the trailing "IE" from the class name
# (e.g. YoutubeIE -> "Youtube"); the enclosing property/def line is elided.
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
# note=None -> default progress message; note=False silences output;
# any other note is printed as "<video_id>: <note>".
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
# Network/HTTP failures are wrapped into ExtractorError, preserving the
# original traceback via sys.exc_info()[2].
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Try to honor the charset advertised in the Content-Type header; the
# fallback default (likely utf-8, original lines 137-138) is elided here.
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
# Debug aid: with --dump-intermediate-pages the raw body is printed base64-
# encoded, so binary responses survive the terminal.
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
# 'replace' keeps extraction going even when the page lies about its charset.
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
# Result-wrapping helpers (issue #608): they tag the info dict with the
# '_type' key the FileDownloader dispatches on.
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
# id/title are only set when provided (guard lines 188/190 are elided).
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
# NOTE(review): excerpted listing — leading integers are original line
# numbers; numbering gaps mark elided source lines. Code kept verbatim.
194 class SearchInfoExtractor(InfoExtractor):
196 Base class for paged search queries extractors.
197 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
198 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Builds the regex matched by suitable()/_real_extract(): an empty prefix
# means "first result", a number means "n results", 'all' means everything.
202 def _make_valid_url(cls):
203 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
206 def suitable(cls, url):
207 return re.match(cls._make_valid_url(), url) is not None
209 def _real_extract(self, query):
210 mobj = re.match(self._make_valid_url(), query)
# The "mobj is None" guard (original line 211) is elided from this view.
212 raise ExtractorError(u'Invalid search query "%s"' % query)
214 prefix = mobj.group('prefix')
215 query = mobj.group('query')
# Empty prefix (guard elided at line 216): download only the top result.
217 return self._get_n_results(query, 1)
218 elif prefix == 'all':
219 return self._get_n_results(query, self._MAX_RESULTS)
# Numeric prefix path (conversion at elided lines 220-222): reject n <= 0,
# clamp n to _MAX_RESULTS with a warning.
223 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
224 elif n > self._MAX_RESULTS:
225 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
226 n = self._MAX_RESULTS
227 return self._get_n_results(query, n)
229 def _get_n_results(self, query, n):
230 """Get a specified number of results for a query"""
# NOTE(review): "sublclasses" is a typo in this runtime message; left as-is
# here since it is program output, not a comment — fix in a code change.
231 raise NotImplementedError("This method must be implemented by sublclasses")
# NOTE(review): excerpted listing — leading integers are original line
# numbers; numbering gaps mark elided source lines. Code kept verbatim.
234 class YoutubeIE(InfoExtractor):
235 """Information extractor for youtube.com."""
# _VALID_URL is a verbose (re.VERBOSE) pattern; its opening lines (236-238)
# are elided from this view. Group 1 = everything before the ID (used by the
# conditional (?(1)) at line 254); the video ID itself is the later group.
239 (?:https?://)? # http(s):// (optional)
240 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
241 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
242 (?:.*?\#/)? # handle anchor (#/) redirect urls
243 (?: # the various things that can precede the ID:
244 (?:(?:v|embed|e)/) # v/ or embed/ or e/
245 |(?: # or the v= param in all its forms
246 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
247 (?:\?|\#!?) # the params delimiter ? or # or #!
248 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
251 )? # optional -> youtube.com/xxxx is OK
252 )? # all until now is optional -> you can pass the naked ID
253 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
254 (?(1).+)? # if we found the ID, everything can follow
256 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
257 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
258 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
259 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
260 _NETRC_MACHINE = 'youtube'
# itag format codes, best quality first; the second list prefers free
# (WebM) formats at equal quality.
261 # Listed in order of quality
262 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
263 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map (entries 265-275 mostly elided here).
264 _video_extensions = {
270 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" resolution strings (entries elided); used by _print_formats
# and the per-format 'format' field.
276 _video_dimensions = {
# NOTE(review): @classmethod decorator presumably elided before this def.
295 def suitable(cls, url):
296 """Receives a URL and returns True if suitable for this IE."""
# Playlist URLs are deferred to YoutubePlaylistIE to avoid double handling.
297 if YoutubePlaylistIE.suitable(url): return False
298 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
300 def report_lang(self):
301 """Report attempt to set language."""
302 self.to_screen(u'Setting language')
304 def report_login(self):
305 """Report attempt to log in."""
306 self.to_screen(u'Logging in')
308 def report_video_webpage_download(self, video_id):
309 """Report attempt to download video webpage."""
310 self.to_screen(u'%s: Downloading video webpage' % video_id)
312 def report_video_info_webpage_download(self, video_id):
313 """Report attempt to download video info webpage."""
314 self.to_screen(u'%s: Downloading video info webpage' % video_id)
316 def report_video_subtitles_download(self, video_id):
317 """Report attempt to download video info webpage."""
318 self.to_screen(u'%s: Checking available subtitles' % video_id)
320 def report_video_subtitles_request(self, video_id, sub_lang, format):
321 """Report attempt to download video info webpage."""
322 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
324 def report_video_subtitles_available(self, video_id, sub_lang_list):
325 """Report available subtitles."""
326 sub_lang = ",".join(list(sub_lang_list.keys()))
327 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
329 def report_information_extraction(self, video_id):
330 """Report attempt to extract video information."""
331 self.to_screen(u'%s: Extracting video information' % video_id)
333 def report_unavailable_format(self, video_id, format):
334 """Report extracted video URL."""
335 self.to_screen(u'%s: Format %s not available' % (video_id, format))
337 def report_rtmp_download(self):
338 """Indicate the download will use the RTMP protocol."""
339 self.to_screen(u'RTMP download detected')
# Queries the timedtext listing; on failure returns an (error_message, None)
# tuple instead of raising — callers test with isinstance(..., tuple).
341 def _get_available_subtitles(self, video_id):
342 self.report_video_subtitles_download(video_id)
343 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
345 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
346 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347 return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Maps lang_code -> human-readable track name.
348 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
349 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
350 if not sub_lang_list:
351 return (u'video doesn\'t have subtitles', None)
354 def _list_available_subtitles(self, video_id):
355 sub_lang_list = self._get_available_subtitles(video_id)
# NOTE(review): if _get_available_subtitles returned an error tuple, the
# .keys() call inside report_video_subtitles_available would fail —
# error handling may live in elided lines; verify against full source.
356 self.report_video_subtitles_available(video_id, sub_lang_list)
# Fetches one subtitle track; returns a 3-tuple
358 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
361 (error_message, sub_lang, sub)
363 self.report_video_subtitles_request(video_id, sub_lang, format)
364 params = compat_urllib_parse.urlencode({
370 url = 'http://www.youtube.com/api/timedtext?' + params
372 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
373 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# Empty body (guard elided at line 375) means no captions were served.
376 return (u'Did not fetch video subtitles', None, None)
377 return (None, sub_lang, sub)
379 def _extract_subtitle(self, video_id):
381 Return a list with a tuple:
382 [(error_message, sub_lang, sub)]
384 sub_lang_list = self._get_available_subtitles(video_id)
385 sub_format = self._downloader.params.get('subtitlesformat')
386 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
387 return [(sub_lang_list[0], None, None)]
# Language preference: explicit --sub-lang, then English, then whatever
# track happens to come first in the dict.
388 if self._downloader.params.get('subtitleslang', False):
389 sub_lang = self._downloader.params.get('subtitleslang')
390 elif 'en' in sub_lang_list:
393 sub_lang = list(sub_lang_list.keys())[0]
394 if not sub_lang in sub_lang_list:
395 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
397 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Downloads every available track (for --all-subs).
400 def _extract_all_subtitles(self, video_id):
401 sub_lang_list = self._get_available_subtitles(video_id)
402 sub_format = self._downloader.params.get('subtitlesformat')
403 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
404 return [(sub_lang_list[0], None, None)]
406 for sub_lang in sub_lang_list:
407 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
408 subtitles.append(subtitle)
411 def _print_formats(self, formats):
412 print('Available formats:')
# Unknown itags fall back to 'flv' / '???' in the listing.
414 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Session setup: optional login (from CLI options or .netrc) plus language
# and age-gate cookies. Many guard/else lines are elided in this view.
416 def _real_initialize(self):
417 if self._downloader is None:
422 downloader_params = self._downloader.params
424 # Attempt to use provided username and password or .netrc data
425 if downloader_params.get('username', None) is not None:
426 username = downloader_params['username']
427 password = downloader_params['password']
428 elif downloader_params.get('usenetrc', False):
430 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
435 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
436 except (IOError, netrc.NetrcParseError) as err:
437 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set language: best-effort; failure only warns, never aborts.
441 request = compat_urllib_request.Request(self._LANG_URL)
444 compat_urllib_request.urlopen(request).read()
445 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
446 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
449 # No authentication to be performed
# Login: fetch the Google login page to scrape the GALX/dsh tokens the
# form submission requires.
453 request = compat_urllib_request.Request(self._LOGIN_URL)
455 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
462 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
464 galx = match.group(1)
466 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Login form fields (several entries elided); mirrors the browser POST.
472 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
476 u'PersistentCookie': u'yes',
478 u'bgresponse': u'js_disabled',
479 u'checkConnection': u'',
480 u'checkedDomains': u'youtube',
486 u'signIn': u'Sign in',
488 u'service': u'youtube',
492 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
494 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
495 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
496 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
499 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The login form reappearing in the response means authentication failed.
500 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
501 self._downloader.report_warning(u'unable to log in: bad username or password')
503 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
504 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age-gate (form dict opening elided); unlike login, failure here
# raises, since age-gated extraction cannot proceed.
510 'action_confirm': 'Confirm',
512 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
514 self.report_age_confirmation()
515 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
516 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
517 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
519 def _extract_id(self, url):
520 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
522 raise ExtractorError(u'Invalid URL: %s' % url)
# group(2) is the video-ID group of the verbose _VALID_URL pattern.
523 video_id = mobj.group(2)
526 def _real_extract(self, url):
527 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
528 mobj = re.search(self._NEXT_URL_RE, url)
530 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
531 video_id = self._extract_id(url)
# Step 1: fetch the watch page (needed for SWF url, uploader id, date,
# description, which are not in get_video_info).
534 self.report_video_webpage_download(video_id)
535 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
536 request = compat_urllib_request.Request(url)
538 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
539 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
540 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
542 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
544 # Attempt to extract SWF player URL
545 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JS-escaped URL (\\/ -> /).
547 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Step 2: query get_video_info, trying several 'el' variants until one
# returns a token.
552 self.report_video_info_webpage_download(video_id)
553 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
554 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
555 % (video_id, el_type))
556 video_info_webpage = self._download_webpage(video_info_url, video_id,
558 errnote='unable to download video info webpage')
559 video_info = compat_parse_qs(video_info_webpage)
560 if 'token' in video_info:
562 if 'token' not in video_info:
563 if 'reason' in video_info:
564 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
566 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
568 # Check for "rental" videos
569 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
570 raise ExtractorError(u'"rental" videos not supported')
572 # Start extracting information
573 self.report_information_extraction(video_id)
# uploader (mandatory)
576 if 'author' not in video_info:
577 raise ExtractorError(u'Unable to extract uploader name')
578 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id (optional, scraped from the watch page)
581 video_uploader_id = None
582 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
584 video_uploader_id = mobj.group(1)
586 self._downloader.report_warning(u'unable to extract uploader nickname')
# title (mandatory)
589 if 'title' not in video_info:
590 raise ExtractorError(u'Unable to extract video title')
591 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail (optional)
594 if 'thumbnail_url' not in video_info:
595 self._downloader.report_warning(u'unable to extract video thumbnail')
597 else: # don't panic if we can't find it
598 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: normalize separators, then parse to YYYYMMDD.
602 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
604 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
605 upload_date = unified_strdate(upload_date)
# description: DOM element first, <meta name="description"> as fallback.
608 video_description = get_element_by_id("eow-description", video_webpage)
609 if video_description:
610 video_description = clean_html(video_description)
612 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
614 video_description = unescapeHTML(fd_mobj.group(1))
616 video_description = u''
# subtitles, driven by --write-sub / --all-subs / --list-subs options.
619 video_subtitles = None
621 if self._downloader.params.get('writesubtitles', False):
622 video_subtitles = self._extract_subtitle(video_id)
624 (sub_error, sub_lang, sub) = video_subtitles[0]
626 self._downloader.report_error(sub_error)
628 if self._downloader.params.get('allsubtitles', False):
629 video_subtitles = self._extract_all_subtitles(video_id)
630 for video_subtitle in video_subtitles:
631 (sub_error, sub_lang, sub) = video_subtitle
633 self._downloader.report_error(sub_error)
635 if self._downloader.params.get('listsubtitles', False):
636 sub_lang_list = self._list_available_subtitles(video_id)
# duration (optional)
639 if 'length_seconds' not in video_info:
640 self._downloader.report_warning(u'unable to extract video duration')
643 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
646 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
648 # Decide which formats to download
649 req_format = self._downloader.params.get('format', None)
# Either an RTMP conn entry or a url_encoded_fmt_stream_map of HTTP urls.
651 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
652 self.report_rtmp_download()
653 video_url_list = [(None, video_info['conn'][0])]
654 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
656 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
657 url_data = compat_parse_qs(url_data_str)
658 if 'itag' in url_data and 'url' in url_data:
# NOTE(review): assumes the 'sig' field is always present alongside
# 'url' — a stream without it would raise KeyError here.
659 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
660 if not 'ratebypass' in url: url += '&ratebypass=yes'
661 url_map[url_data['itag'][0]] = url
# Apply --max-quality cutoff, then keep only itags actually offered.
663 format_limit = self._downloader.params.get('format_limit', None)
664 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
665 if format_limit is not None and format_limit in available_formats:
666 format_list = available_formats[available_formats.index(format_limit):]
668 format_list = available_formats
669 existing_formats = [x for x in format_list if x in url_map]
670 if len(existing_formats) == 0:
671 raise ExtractorError(u'no known formats available for video')
672 if self._downloader.params.get('listformats', None):
673 self._print_formats(existing_formats)
675 if req_format is None or req_format == 'best':
676 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
677 elif req_format == 'worst':
678 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
679 elif req_format in ('-1', 'all'):
680 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
682 # Specific formats. We pick the first in a slash-delimeted sequence.
683 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
684 req_formats = req_format.split('/')
685 video_url_list = None
686 for rf in req_formats:
688 video_url_list = [(rf, url_map[rf])]
690 if video_url_list is None:
691 raise ExtractorError(u'requested format not available')
693 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format.
696 for format_param, video_real_url in video_url_list:
698 video_extension = self._video_extensions.get(format_param, 'flv')
700 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
701 self._video_dimensions.get(format_param, '???'))
705 'url': video_real_url,
706 'uploader': video_uploader,
707 'uploader_id': video_uploader_id,
708 'upload_date': upload_date,
709 'title': video_title,
710 'ext': video_extension,
711 'format': video_format,
712 'thumbnail': video_thumbnail,
713 'description': video_description,
714 'player_url': player_url,
715 'subtitles': video_subtitles,
716 'duration': video_duration
# NOTE(review): excerpted listing — leading integers are original line
# numbers; numbering gaps mark elided source lines. Code kept verbatim.
721 class MetacafeIE(InfoExtractor):
722 """Information Extractor for metacafe.com."""
724 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
725 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
726 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
727 IE_NAME = u'metacafe'
729 def report_disclaimer(self):
730 """Report disclaimer retrieval."""
731 self.to_screen(u'Retrieving disclaimer')
# Session setup: fetch the disclaimer page (for cookies), then POST the
# family-filter/age form; both failures abort extraction.
733 def _real_initialize(self):
734 # Retrieve disclaimer
735 request = compat_urllib_request.Request(self._DISCLAIMER)
737 self.report_disclaimer()
738 disclaimer = compat_urllib_request.urlopen(request).read()
739 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
740 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# Confirm age (form dict opening elided at original lines 742-744).
745 'submit': "Continue - I'm over 18",
747 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
749 self.report_age_confirmation()
750 disclaimer = compat_urllib_request.urlopen(request).read()
751 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
752 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
754 def _real_extract(self, url):
755 # Extract id and simplified title from URL
756 mobj = re.match(self._VALID_URL, url)
758 raise ExtractorError(u'Invalid URL: %s' % url)
760 video_id = mobj.group(1)
762 # Check if video comes from YouTube
# 'yt-<id>' items are YouTube embeds: hand them off via url_result.
763 mobj2 = re.match(r'^yt-(.*)$', video_id)
764 if mobj2 is not None:
765 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
767 # Retrieve video webpage to extract further information
768 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
770 # Extract URL, uploader and title from webpage
771 self.report_extraction(video_id)
# Primary path: direct mediaURL (+ optional gdaKey) in the page source.
772 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
774 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
775 video_extension = mediaURL[-3:]
777 # Extract gdaKey if available
778 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
782 gdaKey = mobj.group(1)
783 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaData (mediaURL + key).
785 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
787 raise ExtractorError(u'Unable to extract media URL')
788 vardict = compat_parse_qs(mobj.group(1))
789 if 'mediaData' not in vardict:
790 raise ExtractorError(u'Unable to extract media URL')
791 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
793 raise ExtractorError(u'Unable to extract media URL')
794 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
795 video_extension = mediaURL[-3:]
796 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
798 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
800 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on these values is Python-2-only (str ->
# unicode); on Python 3 a str has no .decode — verify the intended runtime.
801 video_title = mobj.group(1).decode('utf-8')
803 mobj = re.search(r'submitter=(.*?);', webpage)
805 raise ExtractorError(u'Unable to extract uploader nickname')
806 video_uploader = mobj.group(1)
# Result dict (list-literal opening elided at original lines 807-808).
809 'id': video_id.decode('utf-8'),
810 'url': video_url.decode('utf-8'),
811 'uploader': video_uploader.decode('utf-8'),
813 'title': video_title,
814 'ext': video_extension.decode('utf-8'),
# NOTE(review): excerpted listing — leading integers are original line
# numbers; numbering gaps mark elided source lines. Code kept verbatim.
817 class DailymotionIE(InfoExtractor):
818 """Information Extractor for Dailymotion"""
820 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
821 IE_NAME = u'dailymotion'
823 def _real_extract(self, url):
824 # Extract id and simplified title from URL
825 mobj = re.match(self._VALID_URL, url)
827 raise ExtractorError(u'Invalid URL: %s' % url)
# The path segment is "<id>_<slug>"; strip slug and query noise.
829 video_id = mobj.group(1).split('_')[0].split('?')[0]
831 video_extension = 'mp4'
833 # Retrieve video webpage to extract further information
834 request = compat_urllib_request.Request(url)
# Disable the family filter so restricted videos are served.
835 request.add_header('Cookie', 'family_filter=off')
836 webpage = self._download_webpage(request, video_id)
838 # Extract URL, uploader and title from webpage
839 self.report_extraction(video_id)
840 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
842 raise ExtractorError(u'Unable to extract media URL')
843 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Pick the best available quality key, highest first (the key test and
# max_quality assignment at original lines 846-847 are elided).
845 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
848 self.to_screen(u'Using %s' % key)
851 raise ExtractorError(u'Unable to extract video URL')
853 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
855 raise ExtractorError(u'Unable to extract video URL')
# Un-escape the JSON-escaped URL (\/ -> /).
857 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
859 # TODO: support choosing qualities
861 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
863 raise ExtractorError(u'Unable to extract title')
864 video_title = unescapeHTML(mobj.group('title'))
866 video_uploader = None
867 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
# Fall back to the official-user markup when the owner span is missing.
869 # looking for the official user account
870 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
871 if mobj_official is None:
872 self._downloader.report_warning(u'unable to extract uploader nickname')
874 video_uploader = mobj_official.group(1)
876 video_uploader = mobj.group(1)
878 video_upload_date = None
# Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
879 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
881 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict (opening lines elided at original 882-885).
886 'uploader': video_uploader,
887 'upload_date': video_upload_date,
888 'title': video_title,
889 'ext': video_extension,
# NOTE(review): excerpted listing — leading integers are original line
# numbers; numbering gaps mark elided source lines. Code kept verbatim.
893 class PhotobucketIE(InfoExtractor):
894 """Information extractor for photobucket.com."""
896 # TODO: the original _VALID_URL was:
897 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
898 # Check if it's necessary to keep the old extracion process
899 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
900 IE_NAME = u'photobucket'
902 def _real_extract(self, url):
903 # Extract id from URL
904 mobj = re.match(self._VALID_URL, url)
906 raise ExtractorError(u'Invalid URL: %s' % url)
908 video_id = mobj.group('id')
910 video_extension = mobj.group('ext')
912 # Retrieve video webpage to extract further information
913 webpage = self._download_webpage(url, video_id)
915 # Extract URL, uploader, and title from webpage
916 self.report_extraction(video_id)
# Primary path: structured JSON embedded in the page's JS.
917 # We try first by looking the javascript code:
918 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
920 info = json.loads(mobj.group('json'))
# Result dict built straight from the JSON (opening elided at 921-922).
923 'url': info[u'downloadUrl'],
924 'uploader': info[u'username'],
925 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
926 'title': info[u'title'],
927 'ext': video_extension,
928 'thumbnail': info[u'thumbUrl'],
# Fallback path: scrape the <link rel="video_src"> tag and <title>.
931 # We try looking in other parts of the webpage
932 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
934 raise ExtractorError(u'Unable to extract media URL')
935 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
939 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
941 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on these values is Python-2-only (str ->
# unicode); would fail on Python 3 — verify the intended runtime.
942 video_title = mobj.group(1).decode('utf-8')
944 video_uploader = mobj.group(2).decode('utf-8')
# Fallback result dict (opening elided at original 945-946).
947 'id': video_id.decode('utf-8'),
948 'url': video_url.decode('utf-8'),
949 'uploader': video_uploader,
951 'title': video_title,
952 'ext': video_extension.decode('utf-8'),
956 class YahooIE(InfoExtractor):
957 """Information extractor for screen.yahoo.com."""
# Named group 'id' captures the trailing numeric video id from the page URL.
958 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
960 def _real_extract(self, url):
961 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the guard condition for this raise is not visible in this
# excerpt — presumably `if mobj is None:`; confirm against the full file.
963 raise ExtractorError(u'Invalid URL: %s' % url)
964 video_id = mobj.group('id')
965 webpage = self._download_webpage(url, video_id)
# Two extraction paths below: if the page declares a YUI CONTENT_ID the
# newer YQL JSON API is queried (else branch); otherwise the legacy MRSS
# REST endpoint is scraped with a verbose regex.
966 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
969 # TODO: Check which url parameters are required
970 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
971 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose, DOTALL regex over the MRSS XML: pulls title, description,
# publication date (date part only) and the large thumbnail URL.
972 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
973 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
974 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
975 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
977 self.report_extraction(video_id)
978 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
980 raise ExtractorError(u'Unable to extract video info')
981 video_title = m_info.group('title')
982 video_description = m_info.group('description')
983 video_thumb = m_info.group('thumb')
984 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD form expected for upload_date.
985 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
987 # TODO: Find a way to get mp4 videos
988 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
989 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
# The stream is split into RTMP host ('url') and play path ('path').
990 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
991 video_url = m_rest.group('url')
992 video_path = m_rest.group('path')
994 raise ExtractorError(u'Unable to extract video url')
996 else: # We have to use a different method if another id is defined
997 long_id = m_id.group('new_id')
# Pre-encoded YQL query against yahoo.media.video.streams for this id.
998 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
999 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing the JSON payload.
1000 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1001 info = json.loads(json_str)
1002 res = info[u'query'][u'results'][u'mediaObj'][0]
# First stream entry: host is the RTMP server, path the play path.
1003 stream = res[u'streams'][0]
1004 video_path = stream[u'path']
1005 video_url = stream[u'host']
# NOTE(review): 'meta' is assigned on a line elided from this excerpt —
# presumably res[u'meta']; verify against the full source.
1007 video_title = meta[u'title']
1008 video_description = meta[u'description']
1009 video_thumb = meta[u'thumbnail']
1010 video_date = None # I can't find it
# Result dictionary (opening brace and some fields elided in this excerpt).
1015 'play_path': video_path,
1016 'title':video_title,
1017 'description': video_description,
1018 'thumbnail': video_thumb,
1019 'upload_date': video_date,
1024 class VimeoIE(InfoExtractor):
1025 """Information extractor for vimeo.com."""
1027 # _VALID_URL matches Vimeo URLs
1028 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1031 def _real_extract(self, url, new_video=True):
1032 # Extract ID from URL
1033 mobj = re.match(self._VALID_URL, url)
# NOTE(review): guard (presumably `if mobj is None:`) elided in this excerpt.
1035 raise ExtractorError(u'Invalid URL: %s' % url)
1037 video_id = mobj.group('id')
# Normalize scheme-less and direct-link (play_redirect_hls) URLs to the
# canonical https page URL before downloading.
1038 if not mobj.group('proto'):
1039 url = 'https://' + url
1040 if mobj.group('direct_link'):
1041 url = 'https://vimeo.com/' + video_id
1043 # Retrieve video webpage to extract further information
1044 request = compat_urllib_request.Request(url, None, std_headers)
1045 webpage = self._download_webpage(request, video_id)
1047 # Now we begin extracting as much information as we can from what we
1048 # retrieved. First we extract the information common to all extractors,
1049 # and latter we extract those that are Vimeo specific.
1050 self.report_extraction(video_id)
1052 # Extract the config JSON
# Crude but effective: slice the inline player config object out of the
# page source and parse it as JSON.
1054 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1055 config = json.loads(config)
# Embedding restriction produces a distinct, actionable error message.
1057 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1058 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1060 raise ExtractorError(u'Unable to extract info section')
1063 video_title = config["video"]["title"]
1065 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1066 video_uploader = config["video"]["owner"]["name"]
1067 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1069 # Extract video thumbnail
1070 video_thumbnail = config["video"]["thumbnail"]
1072 # Extract video description
1073 video_description = get_element_by_attribute("itemprop", "description", webpage)
1074 if video_description: video_description = clean_html(video_description)
1075 else: video_description = u''
1077 # Extract upload date
# Date comes from the itemprop meta tag; groups are YYYY, MM, DD.
1078 video_upload_date = None
1079 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1080 if mobj is not None:
1081 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1083 # Vimeo specific: extract request signature and timestamp
# sig + timestamp are required query parameters of the play_redirect URL.
1084 sig = config['request']['signature']
1085 timestamp = config['request']['timestamp']
1087 # Vimeo specific: extract video codec and quality information
1088 # First consider quality, then codecs, then take everything
1089 # TODO bind to format param
# Codec preference order paired with the container extension it implies.
1090 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1091 files = { 'hd': [], 'sd': [], 'other': []}
1092 for codec_name, codec_extension in codecs:
1093 if codec_name in config["video"]["files"]:
1094 if 'hd' in config["video"]["files"][codec_name]:
1095 files['hd'].append((codec_name, codec_extension, 'hd'))
1096 elif 'sd' in config["video"]["files"][codec_name]:
1097 files['sd'].append((codec_name, codec_extension, 'sd'))
# Neither hd nor sd listed: fall back to the first advertised quality.
1099 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best available bucket; tuple layout is (codec, ext, quality).
1101 for quality in ('hd', 'sd', 'other'):
1102 if len(files[quality]) > 0:
1103 video_quality = files[quality][0][2]
1104 video_codec = files[quality][0][0]
1105 video_extension = files[quality][0][1]
1106 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1109 raise ExtractorError(u'No known codec found')
1111 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1112 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dictionary (opening brace and id/url entries elided in this excerpt).
1117 'uploader': video_uploader,
1118 'uploader_id': video_uploader_id,
1119 'upload_date': video_upload_date,
1120 'title': video_title,
1121 'ext': video_extension,
1122 'thumbnail': video_thumbnail,
1123 'description': video_description,
1127 class ArteTvIE(InfoExtractor):
1128 """arte.tv information extractor."""
1130 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live pages are recognized by an index-NN.html final path component.
1131 _LIVE_URL = r'index-[0-9]+\.html$'
1133 IE_NAME = u'arte.tv'
# Fetch a page body, translating network and URL errors into ExtractorError.
1135 def fetch_webpage(self, url):
1136 request = compat_urllib_request.Request(url)
1138 self.report_download_webpage(url)
1139 webpage = compat_urllib_request.urlopen(request).read()
1140 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1141 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1142 except ValueError as err:
1143 raise ExtractorError(u'Invalid URL: %s' % url)
# Download url, apply regex, and map the listed groups into a dict.
# matchTuples is an iterable of (group_index, key, error_message); a missing
# group raises its associated error.
1146 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1147 page = self.fetch_webpage(url)
1148 mobj = re.search(regex, page, regexFlags)
1152 raise ExtractorError(u'Invalid URL: %s' % url)
1154 for (i, key, err) in matchTuples:
1155 if mobj.group(i) is None:
1156 raise ExtractorError(err)
1158 info[key] = mobj.group(i)
# Live stream: locate the videothek JS config, then pull path/player/url
# for the RTMP stream from it. Language code sits 4 path segments from the end.
1162 def extractLiveStream(self, url):
1163 video_lang = url.split('/')[-4]
1164 info = self.grep_webpage(
1166 r'src="(.*?/videothek_js.*?\.js)',
1169 (1, 'url', u'Invalid URL: %s' % url)
1172 http_host = url.split('/')[2]
1173 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1174 info = self.grep_webpage(
1176 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1177 '(http://.*?\.swf).*?' +
1181 (1, 'path', u'could not extract video path: %s' % url),
1182 (2, 'player', u'could not extract video player: %s' % url),
1183 (3, 'url', u'could not extract video url: %s' % url)
1186 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Plus7 (catch-up) stream: follow movie-param -> language-specific <video>
# ref -> final XML carrying id/name/date and the HD URL.
1188 def extractPlus7Stream(self, url):
1189 video_lang = url.split('/')[-3]
1190 info = self.grep_webpage(
1192 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1195 (1, 'url', u'Invalid URL: %s' % url)
1198 next_url = compat_urllib_parse.unquote(info.get('url'))
1199 info = self.grep_webpage(
1201 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1204 (1, 'url', u'Could not find <video> tag: %s' % url)
1207 next_url = compat_urllib_parse.unquote(info.get('url'))
1209 info = self.grep_webpage(
1211 r'<video id="(.*?)".*?>.*?' +
1212 '<name>(.*?)</name>.*?' +
1213 '<dateVideo>(.*?)</dateVideo>.*?' +
1214 '<url quality="hd">(.*?)</url>',
1217 (1, 'id', u'could not extract video id: %s' % url),
1218 (2, 'title', u'could not extract video title: %s' % url),
1219 (3, 'date', u'could not extract video date: %s' % url),
1220 (4, 'url', u'could not extract video url: %s' % url)
# Result dictionary (opening brace elided in this excerpt).
1225 'id': info.get('id'),
1226 'url': compat_urllib_parse.unquote(info.get('url')),
1227 'uploader': u'arte.tv',
1228 'upload_date': unified_strdate(info.get('date')),
1229 'title': info.get('title').decode('utf-8'),
# Dispatch: live URLs go through extractLiveStream, others through Plus7.
1235 def _real_extract(self, url):
1236 video_id = url.split('/')[-1]
1237 self.report_extraction(video_id)
1239 if re.search(self._LIVE_URL, video_id) is not None:
1240 self.extractLiveStream(url)
1243 info = self.extractPlus7Stream(url)
1248 class GenericIE(InfoExtractor):
1249 """Generic last-resort information extractor."""
1252 IE_NAME = u'generic'
# Warn once that the generic fallback is being used (suppressed under test).
1254 def report_download_webpage(self, video_id):
1255 """Report webpage download."""
1256 if not self._downloader.params.get('test', False):
1257 self._downloader.report_warning(u'Falling back on generic information extractor.')
1258 super(GenericIE, self).report_download_webpage(video_id)
1260 def report_following_redirect(self, new_url):
1261 """Report information extraction."""
1262 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1264 def _test_redirect(self, url):
1265 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass forcing the HEAD method so no body is transferred.
1266 class HeadRequest(compat_urllib_request.Request):
1267 def get_method(self):
1270 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1272 Subclass the HTTPRedirectHandler to make it use our
1273 HeadRequest also on the redirected URL
1275 def redirect_request(self, req, fp, code, msg, headers, newurl):
1276 if code in (301, 302, 303, 307):
1277 newurl = newurl.replace(' ', '%20')
# Drop entity headers: a HEAD follow-up carries no body.
1278 newheaders = dict((k,v) for k,v in req.headers.items()
1279 if k.lower() not in ("content-length", "content-type"))
1280 return HeadRequest(newurl,
1282 origin_req_host=req.get_origin_req_host(),
1285 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1287 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1289 Fallback to GET if HEAD is not allowed (405 HTTP error)
1291 def http_error_405(self, req, fp, code, msg, headers):
1295 newheaders = dict((k,v) for k,v in req.headers.items()
1296 if k.lower() not in ("content-length", "content-type"))
1297 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1299 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers needed for the HEAD probe.
1303 opener = compat_urllib_request.OpenerDirector()
1304 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1305 HTTPMethodFallback, HEADRedirectHandler,
1306 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1307 opener.add_handler(handler())
1309 response = opener.open(HeadRequest(url))
1310 if response is None:
1311 raise ExtractorError(u'Invalid URL protocol')
1312 new_url = response.geturl()
1317 self.report_following_redirect(new_url)
1320 def _real_extract(self, url):
# Resolve shorteners first; if the URL redirects, delegate to the target.
1321 new_url = self._test_redirect(url)
1322 if new_url: return [self.url_result(new_url)]
1324 video_id = url.split('/')[-1]
1326 webpage = self._download_webpage(url, video_id)
1327 except ValueError as err:
1328 # since this is the last-resort InfoExtractor, if
1329 # this error is thrown, it'll be thrown here
1330 raise ExtractorError(u'Invalid URL: %s' % url)
1332 self.report_extraction(video_id)
1333 # Start with something easy: JW Player in SWFObject
1334 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1336 # Broaden the search a little bit
1337 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1339 # Broaden the search a little bit: JWPlayer JS loader
1340 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1342 raise ExtractorError(u'Invalid URL: %s' % url)
1344 # It's possible that one of the regexes
1345 # matched, but returned an empty group:
1346 if mobj.group(1) is None:
1347 raise ExtractorError(u'Invalid URL: %s' % url)
1349 video_url = compat_urllib_parse.unquote(mobj.group(1))
1350 video_id = os.path.basename(video_url)
1352 # here's a fun little line of code for you:
# id/extension are derived from the media file's basename.
1353 video_extension = os.path.splitext(video_id)[1][1:]
1354 video_id = os.path.splitext(video_id)[0]
1356 # it's tempting to parse this further, but you would
1357 # have to take into account all the variations like
1358 # Video Title - Site Name
1359 # Site Name | Video Title
1360 # Video Title - Tagline | Site Name
1361 # and so on and so forth; it's just not practical
1362 mobj = re.search(r'<title>(.*)</title>', webpage)
1364 raise ExtractorError(u'Unable to extract title')
1365 video_title = mobj.group(1)
1367 # video uploader is domain name
1368 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1370 raise ExtractorError(u'Unable to extract title')
1371 video_uploader = mobj.group(1)
# Result dictionary (opening brace and id/url entries elided in this excerpt).
1376 'uploader': video_uploader,
1377 'upload_date': None,
1378 'title': video_title,
1379 'ext': video_extension,
1383 class YoutubeSearchIE(SearchInfoExtractor):
1384 """Information Extractor for YouTube search queries."""
# GData v2 search endpoint; %s = quoted query, %i = 1-based start index.
1385 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1387 IE_NAME = u'youtube:search'
1388 _SEARCH_KEY = 'ytsearch'
1390 def report_download_page(self, query, pagenum):
1391 """Report attempt to download search page with given number."""
1392 query = query.decode(preferredencoding())
1393 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1395 def _get_n_results(self, query, n):
1396 """Get a specified number of results for a query"""
# Page through the API 50 results at a time until n (or the total hit
# count) is reached. Loop/accumulator initialization elided in this excerpt.
1402 while (50 * pagenum) < limit:
1403 self.report_download_page(query, pagenum+1)
1404 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1405 request = compat_urllib_request.Request(result_url)
1407 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1408 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1409 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1410 api_response = json.loads(data)['data']
1412 if not 'items' in api_response:
1413 raise ExtractorError(u'[youtube] No video results')
1415 new_ids = list(video['id'] for video in api_response['items'])
1416 video_ids += new_ids
# Cap the effective limit at what the API says actually exists.
1418 limit = min(n, api_response['totalItems'])
1421 if len(video_ids) > n:
1422 video_ids = video_ids[:n]
1423 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1424 return self.playlist_result(videos, query)
1427 class GoogleSearchIE(SearchInfoExtractor):
1428 """Information Extractor for Google Video search queries."""
# Presence of the "next" pagination button marks further result pages.
1429 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1431 IE_NAME = u'video.google:search'
1432 _SEARCH_KEY = 'gvsearch'
1434 def _get_n_results(self, query, n):
1435 """Get a specified number of results for a query"""
# Builds a playlist result dict; scrapes Google's video-tab result pages
# (10 hits per page) and collects each hit's target URL.
1438 '_type': 'playlist',
1443 for pagenum in itertools.count(1):
1444 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1445 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1446 note='Downloading result page ' + str(pagenum))
1448 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
# Entry dict construction partially elided in this excerpt.
1451 'url': mobj.group(1)
1453 res['entries'].append(e)
# Stop once n results are covered or no further pages are advertised.
1455 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1458 class YahooSearchIE(SearchInfoExtractor):
1459 """Information Extractor for Yahoo! Video search queries."""
1462 IE_NAME = u'screen.yahoo:search'
1463 _SEARCH_KEY = 'yvsearch'
1465 def _get_n_results(self, query, n):
1466 """Get a specified number of results for a query"""
# Builds a playlist result dict; queries the JSON search endpoint
# 30 hits per page (b= is the result offset).
1469 '_type': 'playlist',
1473 for pagenum in itertools.count(0):
1474 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1475 webpage = self._download_webpage(result_url, query,
1476 note='Downloading results page '+str(pagenum+1))
1477 info = json.loads(webpage)
# NOTE(review): 'm' below is presumably info[u'm'] (pagination metadata)
# assigned on a line elided from this excerpt; verify against the full file.
1479 results = info[u'results']
1481 for (i, r) in enumerate(results):
1482 if (pagenum * 30) +i >= n:
# Each result r is an HTML snippet; the watch-page URL is regex-extracted
# and handed to the Yahoo extractor.
1484 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1485 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1486 res['entries'].append(e)
# Stop when n is reached or the metadata says this was the last page.
1487 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1493 class YoutubePlaylistIE(InfoExtractor):
1494 """Information Extractor for YouTube playlists."""
# Verbose regex (interior lines elided here): matches playlist pages and
# bare PL/EC/UU playlist ids.
1496 _VALID_URL = r"""(?:
1501 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1502 \? (?:.*?&)*? (?:p|a|list)=
1505 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1508 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
# GData playlist feed; %s = playlist id, %i = max-results, %i = start-index.
1510 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1512 IE_NAME = u'youtube:playlist'
# Overridden because _VALID_URL is a verbose regex and needs re.VERBOSE.
1515 def suitable(cls, url):
1516 """Receives a URL and returns True if suitable for this IE."""
1517 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1519 def _real_extract(self, url):
1520 # Extract playlist id
1521 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1523 raise ExtractorError(u'Invalid URL: %s' % url)
1525 # Download playlist videos from API
# Either alternative of _VALID_URL may have captured the id.
1526 playlist_id = mobj.group(1) or mobj.group(2)
# Paged fetch loop (initialization elided in this excerpt).
1531 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1532 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1535 response = json.loads(page)
1536 except ValueError as err:
1537 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1539 if 'feed' not in response:
1540 raise ExtractorError(u'Got a malformed response from YouTube API')
1541 playlist_title = response['feed']['title']['$t']
1542 if 'entry' not in response['feed']:
1543 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, video_url) pairs; entries without 'content' are skipped.
1546 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1547 for entry in response['feed']['entry']
1548 if 'content' in entry ]
# A short page means this was the final page of the feed.
1550 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1554 videos = [v[1] for v in sorted(videos)]
1556 url_results = [self.url_result(url, 'Youtube') for url in videos]
1557 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1560 class YoutubeChannelIE(InfoExtractor):
1561 """Information Extractor for YouTube channels."""
1563 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1564 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker string whose presence in a page means more pages exist.
1565 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1566 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1567 IE_NAME = u'youtube:channel'
# Scrape unique video ids from /watch?v= links in a page of channel HTML.
1569 def extract_videos_from_page(self, page):
1571 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1572 if mobj.group(1) not in ids_in_page:
1573 ids_in_page.append(mobj.group(1))
1576 def _real_extract(self, url):
1577 # Extract channel id
1578 mobj = re.match(self._VALID_URL, url)
1580 raise ExtractorError(u'Invalid URL: %s' % url)
1582 # Download channel page
1583 channel_id = mobj.group(1)
1587 url = self._TEMPLATE_URL % (channel_id, pagenum)
1588 page = self._download_webpage(url, channel_id,
1589 u'Downloading page #%s' % pagenum)
1591 # Extract video identifiers
1592 ids_in_page = self.extract_videos_from_page(page)
1593 video_ids.extend(ids_in_page)
1595 # Download any subsequent channel pages using the json-based channel_ajax query
1596 if self._MORE_PAGES_INDICATOR in page:
1598 pagenum = pagenum + 1
1600 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1601 page = self._download_webpage(url, channel_id,
1602 u'Downloading page #%s' % pagenum)
# Ajax responses are JSON wrapping the HTML fragment in 'content_html'.
1604 page = json.loads(page)
1606 ids_in_page = self.extract_videos_from_page(page['content_html'])
1607 video_ids.extend(ids_in_page)
1609 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1612 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
# Hand every collected id to the YouTube extractor as a playlist.
1614 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1615 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1616 return [self.playlist_result(url_entries, channel_id)]
1619 class YoutubeUserIE(InfoExtractor):
1620 """Information Extractor for YouTube users."""
1622 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1623 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps feed pages at 50 entries; paging is driven by start-index.
1624 _GDATA_PAGE_SIZE = 50
1625 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1626 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1627 IE_NAME = u'youtube:user'
1629 def _real_extract(self, url):
1631 mobj = re.match(self._VALID_URL, url)
1633 raise ExtractorError(u'Invalid URL: %s' % url)
1635 username = mobj.group(1)
1637 # Download video ids using YouTube Data API. Result size per
1638 # query is limited (currently to 50 videos) so we need to query
1639 # page by page until there are no video ids - it means we got
# Paged fetch loop (initialization elided in this excerpt); indices are 1-based.
1646 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1648 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1649 page = self._download_webpage(gdata_url, username,
1650 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1652 # Extract video identifiers
# Deduplicate ids within the page while preserving feed order.
1655 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1656 if mobj.group(1) not in ids_in_page:
1657 ids_in_page.append(mobj.group(1))
1659 video_ids.extend(ids_in_page)
1661 # A little optimization - if current page is not
1662 # "full", ie. does not contain PAGE_SIZE video ids then
1663 # we can assume that this page is the last one - there
1664 # are no more ids on further pages - no need to query
1667 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1672 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1673 url_results = [self.url_result(url, 'Youtube') for url in urls]
1674 return [self.playlist_result(url_results, playlist_title = username)]
1677 class BlipTVUserIE(InfoExtractor):
1678 """Information Extractor for blip.tv users."""
1680 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1682 IE_NAME = u'blip.tv:user'
1684 def _real_extract(self, url):
1686 mobj = re.match(self._VALID_URL, url)
1688 raise ExtractorError(u'Invalid URL: %s' % url)
1690 username = mobj.group(1)
# Episode-list Ajax endpoint; the numeric users_id is scraped from the
# user page before the template can be completed.
1692 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1694 page = self._download_webpage(url, username, u'Downloading user page')
1695 mobj = re.search(r'data-users-id="([^"]+)"', page)
1696 page_base = page_base % mobj.group(1)
1699 # Download video ids using BlipTV Ajax calls. Result size per
1700 # query is limited (currently to 12 videos) so we need to query
1701 # page by page until there are no video ids - it means we got
# Paged fetch loop (initialization elided in this excerpt).
1708 url = page_base + "&page=" + str(pagenum)
1709 page = self._download_webpage(url, username,
1710 u'Downloading video ids from page %d' % pagenum)
1712 # Extract video identifiers
# Deduplicate hrefs within the page; values are HTML-unescaped slugs.
1715 for mobj in re.finditer(r'href="/([^"]+)"', page):
1716 if mobj.group(1) not in ids_in_page:
1717 ids_in_page.append(unescapeHTML(mobj.group(1)))
1719 video_ids.extend(ids_in_page)
1721 # A little optimization - if current page is not
1722 # "full", ie. does not contain PAGE_SIZE video ids then
1723 # we can assume that this page is the last one - there
1724 # are no more ids on further pages - no need to query
1727 if len(ids_in_page) < self._PAGE_SIZE:
1732 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1733 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1734 return [self.playlist_result(url_entries, playlist_title = username)]
1737 class DepositFilesIE(InfoExtractor):
1738 """Information extractor for depositfiles.com"""
1740 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1742 def _real_extract(self, url):
1743 file_id = url.split('/')[-1]
1744 # Rebuild url in english locale
1745 url = 'http://depositfiles.com/en/files/' + file_id
1747 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
1748 free_download_indication = { 'gateway_result' : '1' }
1749 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1751 self.report_download_webpage(file_id)
1752 webpage = compat_urllib_request.urlopen(request).read()
1753 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1754 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1756 # Search for the real file URL
1757 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1758 if (mobj is None) or (mobj.group(1) is None):
1759 # Try to figure out reason of the error.
# Surface the site's own restriction notice when available instead of a
# generic failure.
1760 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1761 if (mobj is not None) and (mobj.group(1) is not None):
1762 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1763 raise ExtractorError(u'%s' % restriction_message)
1765 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1767 file_url = mobj.group(1)
1768 file_extension = os.path.splitext(file_url)[1][1:]
1770 # Search for file title
1771 mobj = re.search(r'<b title="(.*?)">', webpage)
1773 raise ExtractorError(u'Unable to extract title')
1774 file_title = mobj.group(1).decode('utf-8')
# Result dictionary (opening brace and uploader entry elided in this excerpt).
1777 'id': file_id.decode('utf-8'),
1778 'url': file_url.decode('utf-8'),
1780 'upload_date': None,
1781 'title': file_title,
1782 'ext': file_extension.decode('utf-8'),
1786 class FacebookIE(InfoExtractor):
1787 """Information Extractor for Facebook"""
1789 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1790 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# Machine name used to look up credentials in the user's .netrc file.
1791 _NETRC_MACHINE = 'facebook'
1792 IE_NAME = u'facebook'
1794 def report_login(self):
1795 """Report attempt to log in."""
1796 self.to_screen(u'Logging in')
# Optional login: credentials come from --username/--password or .netrc;
# login failures only warn (public videos may still work).
1798 def _real_initialize(self):
1799 if self._downloader is None:
1804 downloader_params = self._downloader.params
1806 # Attempt to use provided username and password or .netrc data
1807 if downloader_params.get('username', None) is not None:
1808 useremail = downloader_params['username']
1809 password = downloader_params['password']
1810 elif downloader_params.get('usenetrc', False):
1812 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1813 if info is not None:
1817 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1818 except (IOError, netrc.NetrcParseError) as err:
1819 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip login entirely.
1822 if useremail is None:
1831 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1834 login_results = compat_urllib_request.urlopen(request).read()
# A login form in the response means authentication did not succeed.
1835 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1836 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1838 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1839 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1842 def _real_extract(self, url):
1843 mobj = re.match(self._VALID_URL, url)
1845 raise ExtractorError(u'Invalid URL: %s' % url)
1846 video_id = mobj.group('ID')
1848 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1849 webpage = self._download_webpage(url, video_id)
# The player parameters sit between two fixed SWF-setup JS statements;
# anchor on those literals to slice out the JSON in between.
1851 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1852 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1853 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1855 raise ExtractorError(u'Cannot parse data')
1856 data = dict(json.loads(m.group(1)))
# 'params' is URL-encoded JSON nested inside the outer JSON blob.
1857 params_raw = compat_urllib_parse.unquote(data['params'])
1858 params = json.loads(params_raw)
1859 video_data = params['video_data'][0]
# Prefer the HD source, fall back to SD.
1860 video_url = video_data.get('hd_src')
1862 video_url = video_data['sd_src']
1864 raise ExtractorError(u'Cannot find video URL')
1865 video_duration = int(video_data['video_duration'])
1866 thumbnail = video_data['thumbnail_src']
1868 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
1870 raise ExtractorError(u'Cannot find title in webpage')
1871 video_title = unescapeHTML(m.group(1))
# Result dictionary (opening brace and id/url/ext entries elided here).
1875 'title': video_title,
1878 'duration': video_duration,
1879 'thumbnail': thumbnail,
1884 class BlipTVIE(InfoExtractor):
1885 """Information extractor for blip.tv"""
1887 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Extracts the filename extension from a media URL.
1888 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1889 IE_NAME = u'blip.tv'
1891 def report_direct_download(self, title):
1892 """Report information extraction."""
1893 self.to_screen(u'%s: Direct download detected' % title)
1895 def _real_extract(self, url):
1896 mobj = re.match(self._VALID_URL, url)
1898 raise ExtractorError(u'Invalid URL: %s' % url)
1900 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf fragment URLs are rewritten to /play/ URLs first.
1901 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1902 if api_mobj is not None:
1903 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1904 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect to a player whose fragment carries the real file id;
# recurse once with the canonical /a/a-<id> URL.
1905 if urlp.path.startswith('/play/'):
1906 request = compat_urllib_request.Request(url)
1907 response = compat_urllib_request.urlopen(request)
1908 redirecturl = response.geturl()
1909 rurlp = compat_urllib_parse_urlparse(redirecturl)
1910 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1911 url = 'http://blip.tv/a/a-' + file_id
1912 return self._real_extract(url)
# Ask the site for its JSON representation of the page (cchar is the query
# separator chosen on a line elided from this excerpt).
1919 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1920 request = compat_urllib_request.Request(json_url)
# blip.tv serves different data depending on the client; spoof iTunes.
1921 request.add_header('User-Agent', 'iTunes/10.6.1')
1922 self.report_extraction(mobj.group(1))
1925 urlh = compat_urllib_request.urlopen(request)
1926 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1927 basename = url.split('/')[-1]
1928 title,ext = os.path.splitext(basename)
1929 title = title.decode('UTF-8')
1930 ext = ext.replace('.', '')
1931 self.report_direct_download(title)
# Direct-download info dict (other entries elided in this excerpt).
1936 'upload_date': None,
1941 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1942 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
1943 if info is None: # Regular URL
1945 json_code_bytes = urlh.read()
1946 json_code = json_code_bytes.decode('utf-8')
1947 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1948 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
1951 json_data = json.loads(json_code)
# Some responses wrap the record in a 'Post' envelope; unwrap if present.
1952 if 'Post' in json_data:
1953 data = json_data['Post']
# Site date format e.g. '05-19-13 10:30AM' -> YYYYMMDD.
1957 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1958 video_url = data['media']['url']
1959 umobj = re.match(self._URL_EXT, video_url)
1961 raise ValueError('Can not determine filename extension')
1962 ext = umobj.group(1)
# Info dict from the JSON record (opening brace and url/ext entries elided).
1965 'id': data['item_id'],
1967 'uploader': data['display_name'],
1968 'upload_date': upload_date,
1969 'title': data['title'],
1971 'format': data['media']['mimeType'],
1972 'thumbnail': data['thumbnailUrl'],
1973 'description': data['description'],
1974 'player_url': data['embedUrl'],
# Must match the request header above or the media URL may be rejected.
1975 'user_agent': 'iTunes/10.6.1',
1977 except (ValueError,KeyError) as err:
1978 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
1983 class MyVideoIE(InfoExtractor):
# Extractor for myvideo.de watch pages. Tries a direct <source src=...> FLV
# first; otherwise fetches an encrypted player XML blob, RC4-decrypts it with
# a key derived from double-base64 material plus the video id, and pulls the
# RTMP (or plain HTTP) stream URL out of the decrypted data.
1984 """Information Extractor for myvideo.de."""
1986 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1987 IE_NAME = u'myvideo'
1989 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
1990 # Released into the Public Domain by Tristan Fischer on 2013-05-19
1991 # https://github.com/rg3/youtube-dl/pull/842
# Standard RC4: key-scheduling over a 256-entry box, then XOR keystream over
# `data`. Key bytes are read via compat_ord for Py2/Py3 compatibility.
# NOTE(review): `x` is used before any visible assignment — presumably
# initialized to 0 on an elided line; confirm against the full file.
1992 def __rc4crypt(self,data, key):
1994 box = list(range(256))
1995 for i in list(range(256)):
1996 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
1997 box[i], box[x] = box[x], box[i]
2003 y = (y + box[x]) % 256
2004 box[x], box[y] = box[y], box[x]
2005 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# NOTE(review): hex md5 digest returned as bytes — this looks like the tail of
# a small private md5 helper used for key derivation; verify.
2009 return hashlib.md5(s).hexdigest().encode()
2011 def _real_extract(self,url):
2012 mobj = re.match(self._VALID_URL, url)
2014 raise ExtractorError(u'invalid URL: %s' % url)
2016 video_id = mobj.group(1)
# Double-base64-encoded key material (decoded twice via base64.b64decode
# below) that is combined with the video id to build the RC4 key.
2019 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2020 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2021 b'TnpsbA0KTVRkbU1tSTRNdz09'
2025 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2026 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: some pages expose a direct <source src='...'> video element.
2028 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2029 if mobj is not None:
2030 self.report_extraction(video_id)
2031 video_url = mobj.group(1) + '.flv'
2033 mobj = re.search('<title>([^<]+)</title>', webpage)
2035 raise ExtractorError(u'Unable to extract title')
2036 video_title = mobj.group(1)
2038 mobj = re.search('[.](.+?)$', video_url)
2040 raise ExtractorError(u'Unable to extract extention')
2041 video_ext = mobj.group(1)
2047 'upload_date': None,
2048 'title': video_title,
# Slow path: parse the page's flashvars to locate the encrypted player XML.
2053 mobj = re.search('var flashvars={(.+?)}', webpage)
2055 raise ExtractorError(u'Unable to extract video')
2060 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2061 if not a == '_encxml':
2064 encxml = compat_urllib_parse.unquote(b)
2065 if not params.get('domain'):
2066 params['domain'] = 'www.myvideo.de'
2067 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV flash player variant is not supported; fall back to the generic
# get_player_video_xml endpoint instead.
2068 if 'flash_playertype=MTV' in xmldata_url:
2069 self._downloader.report_warning(u'avoiding MTV player')
2071 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2072 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is 'key=value'-shaped; the hex payload after '=' is the
# RC4-encrypted XML.
2076 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2077 enc_data_b = binascii.unhexlify(enc_data)
2079 base64.b64decode(base64.b64decode(GK)) +
2081 str(video_id).encode('utf-8')
2084 dec_data = self.__rc4crypt(enc_data_b, sk)
2087 self.report_extraction(video_id)
2089 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2091 raise ExtractorError(u'unable to extract rtmpurl')
2092 video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
# Some RTMPE endpoints only work over RTMPT for this player.
2093 if 'myvideo2flash' in video_rtmpurl:
2094 self._downloader.report_warning(u'forcing RTMPT ...')
2095 video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2097 # extract non rtmp videos
2098 if (video_rtmpurl is None) or (video_rtmpurl == ''):
2099 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2101 raise ExtractorError(u'unable to extract url')
2102 video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2104 mobj = re.search('source=\'(.*?)\'', dec_data)
2106 raise ExtractorError(u'unable to extract swfobj')
2107 video_file = compat_urllib_parse.unquote(mobj.group(1))
# f4m manifests map to an HLS (.m3u8) playlist; everything else is played
# via an RTMP play path of the form 'ext:path'.
2109 if not video_file.endswith('f4m'):
2110 ppath, prefix = video_file.split('.')
2111 video_playpath = '%s:%s' % (prefix, ppath)
2112 video_hls_playlist = ''
2115 video_hls_playlist = (
2116 video_filepath + video_file
2117 ).replace('.f4m', '.m3u8')
2119 mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
2121 raise ExtractorError(u'unable to extract swfobj')
2122 video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2124 mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
2126 raise ExtractorError(u'unable to extract title')
2127 video_title = mobj.group(1)
2131 'url': video_rtmpurl,
2132 'tc_url': video_rtmpurl,
2134 'upload_date': None,
2135 'title': video_title,
2137 'play_path': video_playpath,
2138 'video_file': video_file,
2139 'video_hls_playlist': video_hls_playlist,
2140 'player_url': video_swfobj,
2143 class ComedyCentralIE(InfoExtractor):
# Handles The Daily Show and The Colbert Report. A URL may be a shortname
# alias (':tds', ':colbert', ...), a full-episode page, or a clip page; full
# episodes expand into several parts, each yielding one info dict.
2144 """Information extractor for The Daily Show and Colbert Report """
2146 # urls can be abbreviations like :thedailyshow or :colbert
2147 # urls for episodes like:
2148 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2149 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2150 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2151 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2152 |(https?://)?(www\.)?
2153 (?P<showname>thedailyshow|colbertnation)\.com/
2154 (full-episodes/(?P<episode>.*)|
2156 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2157 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2160 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2162 _video_extensions = {
2170 _video_dimensions = {
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern; the base
# class matcher does not pass that flag.
2180 def suitable(cls, url):
2181 """Receives a URL and returns True if suitable for this IE."""
2182 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Print the bitrate/extension/dimension table for --list-formats.
2184 def _print_formats(self, formats):
2185 print('Available formats:')
2187 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2190 def _real_extract(self, url):
2191 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2193 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname aliases are rewritten to the show's full-episodes page and the
# URL is re-matched so the named groups below are populated.
2195 if mobj.group('shortname'):
2196 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2197 url = u'http://www.thedailyshow.com/full-episodes/'
2199 url = u'http://www.colbertnation.com/full-episodes/'
2200 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2201 assert mobj is not None
2203 if mobj.group('clip'):
2204 if mobj.group('showname') == 'thedailyshow':
2205 epTitle = mobj.group('tdstitle')
2207 epTitle = mobj.group('cntitle')
2210 dlNewest = not mobj.group('episode')
2212 epTitle = mobj.group('showname')
2214 epTitle = mobj.group('episode')
2216 self.report_extraction(epTitle)
2217 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow the server redirect: the final URL identifies the actual episode.
2219 url = htmlHandle.geturl()
2220 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2222 raise ExtractorError(u'Invalid redirected URL: ' + url)
2223 if mobj.group('episode') == '':
2224 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2225 epTitle = mobj.group('episode')
2227 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2229 if len(mMovieParams) == 0:
2230 # The Colbert Report embeds the information in a without
2231 # a URL prefix; so extract the alternate reference
2232 # and then add the URL prefix manually.
2234 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2235 if len(altMovieParams) == 0:
2236 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2238 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2240 uri = mMovieParams[0][1]
2241 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2242 indexXml = self._download_webpage(indexUrl, epTitle,
2243 u'Downloading show index',
2244 u'unable to download episode index')
# The MRSS index lists one <item> per episode part; each part's <guid> is a
# colon-separated media id whose last segment is the short id.
2248 idoc = xml.etree.ElementTree.fromstring(indexXml)
2249 itemEls = idoc.findall('.//item')
2250 for partNum,itemEl in enumerate(itemEls):
2251 mediaId = itemEl.findall('./guid')[0].text
2252 shortMediaId = mediaId.split(':')[-1]
2253 showId = mediaId.split(':')[-2].replace('.com', '')
2254 officialTitle = itemEl.findall('./title')[0].text
2255 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2257 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2258 compat_urllib_parse.urlencode({'uri': mediaId}))
2259 configXml = self._download_webpage(configUrl, epTitle,
2260 u'Downloading configuration for %s' % shortMediaId)
# Collect (bitrate, rtmp-src) pairs from each <rendition> element.
2262 cdoc = xml.etree.ElementTree.fromstring(configXml)
2264 for rendition in cdoc.findall('.//rendition'):
2265 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2269 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2272 if self._downloader.params.get('listformats', None):
2273 self._print_formats([i[0] for i in turls])
2276 # For now, just pick the highest bitrate
2277 format,rtmp_video_url = turls[-1]
2279 # Get the format arg from the arg stream
2280 req_format = self._downloader.params.get('format', None)
2282 # Select format if we can find one
2285 format, rtmp_video_url = f, v
# Rewrite the RTMP URL to the equivalent progressive HTTP mirror.
2288 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2290 raise ExtractorError(u'Cannot transform RTMP url')
2291 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2292 video_url = base + m.group('finalid')
2294 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2299 'upload_date': officialDate,
2304 'description': officialTitle,
2306 results.append(info)
2311 class EscapistIE(InfoExtractor):
# Reads Open Graph meta tags from the video page, follows the player's
# config= query parameter to a JS-style config blob, and takes the stream
# URL from its playlist.
2312 """Information extractor for The Escapist """
2314 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2315 IE_NAME = u'escapist'
2317 def _real_extract(self, url):
2318 mobj = re.match(self._VALID_URL, url)
2320 raise ExtractorError(u'Invalid URL: %s' % url)
2321 showName = mobj.group('showname')
2322 videoId = mobj.group('episode')
2324 self.report_extraction(showName)
2325 webPage = self._download_webpage(url, showName)
# NOTE(review): these .group(1) calls assume every meta tag is present —
# a missing tag would raise AttributeError rather than ExtractorError.
2327 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2328 description = unescapeHTML(descMatch.group(1))
2329 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2330 imgUrl = unescapeHTML(imgMatch.group(1))
2331 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2332 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2333 configUrlMatch = re.search('config=(.*)$', playerUrl)
2334 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2336 configJSON = self._download_webpage(configUrl, showName,
2337 u'Downloading configuration',
2338 u'unable to download configuration')
2340 # Technically, it's JavaScript, not JSON
# Crude quote normalization so json.loads can parse the JS object literal.
2341 configJSON = configJSON.replace("'", '"')
2344 config = json.loads(configJSON)
2345 except (ValueError,) as err:
2346 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2348 playlist = config['playlist']
# Index 1 (not 0) holds the actual video entry in this playlist layout.
2349 videoUrl = playlist[1]['url']
2354 'uploader': showName,
2355 'upload_date': None,
2358 'thumbnail': imgUrl,
2359 'description': description,
2360 'player_url': playerUrl,
2365 class CollegeHumorIE(InfoExtractor):
# Downloads the moogaloop metadata XML, then the Adobe f4m manifest it
# points at, and assembles a direct 'Seg1-Frag1' fragment URL from the
# manifest's media/id nodes.
2366 """Information extractor for collegehumor.com"""
2369 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2370 IE_NAME = u'collegehumor'
# Progress message for the manifest download step.
2372 def report_manifest(self, video_id):
2373 """Report information extraction."""
2374 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2376 def _real_extract(self, url):
2377 mobj = re.match(self._VALID_URL, url)
2379 raise ExtractorError(u'Invalid URL: %s' % url)
2380 video_id = mobj.group('videoid')
2385 'upload_date': None,
2388 self.report_extraction(video_id)
2389 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2391 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2392 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2393 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# Metadata XML: <video> node carries description/caption/thumbnail/file.
2395 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2397 videoNode = mdoc.findall('./video')[0]
2398 info['description'] = videoNode.findall('./description')[0].text
2399 info['title'] = videoNode.findall('./caption')[0].text
2400 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2401 manifest_url = videoNode.findall('./file')[0].text
2403 raise ExtractorError(u'Invalid metadata XML file')
# hdcore query parameter is required by the Adobe HTTP Dynamic Streaming CDN.
2405 manifest_url += '?hdcore=2.10.3'
2406 self.report_manifest(video_id)
2408 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2409 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2410 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# f4m manifest (Adobe namespace): media/@url plus the manifest <id> text
# are combined into the first-segment fragment URL below.
2412 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2414 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2415 node_id = media_node.attrib['url']
2416 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2417 except IndexError as err:
2418 raise ExtractorError(u'Invalid manifest file')
2420 url_pr = compat_urllib_parse_urlparse(manifest_url)
2421 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2428 class XVideosIE(InfoExtractor):
# Simple regex-scrape extractor: flv_url parameter, <title> tag, and a
# thumbnail URL are pulled straight from the watch page HTML.
2429 """Information extractor for xvideos.com"""
2431 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2432 IE_NAME = u'xvideos'
2434 def _real_extract(self, url):
2435 mobj = re.match(self._VALID_URL, url)
2437 raise ExtractorError(u'Invalid URL: %s' % url)
2438 video_id = mobj.group(1)
2440 webpage = self._download_webpage(url, video_id)
2442 self.report_extraction(video_id)
# The flash player receives the stream as a URL-encoded flv_url parameter.
2446 mobj = re.search(r'flv_url=(.+?)&', webpage)
2448 raise ExtractorError(u'Unable to extract video url')
2449 video_url = compat_urllib_parse.unquote(mobj.group(1))
2453 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2455 raise ExtractorError(u'Unable to extract video title')
2456 video_title = mobj.group(1)
2459 # Extract video thumbnail
2460 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2462 raise ExtractorError(u'Unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not just the filename.
2463 video_thumbnail = mobj.group(0)
2469 'upload_date': None,
2470 'title': video_title,
2472 'thumbnail': video_thumbnail,
2473 'description': None,
2479 class SoundcloudIE(InfoExtractor):
# Resolves a soundcloud.com/<uploader>/<slug> page to a track id via the
# public resolve.json API (hard-coded client_id), then fetches the stream
# definitions and returns the 128kbps MP3 URL.
2480 """Information extractor for soundcloud.com
2481 To access the media, the uid of the song and a stream token
2482 must be extracted from the page source and the script must make
2483 a request to media.soundcloud.com/crossdomain.xml. Then
2484 the media can be grabbed by requesting from an url composed
2485 of the stream token and uid
2488 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2489 IE_NAME = u'soundcloud'
# Progress message for the resolve step.
2491 def report_resolve(self, video_id):
2492 """Report information extraction."""
2493 self.to_screen(u'%s: Resolving id' % video_id)
2495 def _real_extract(self, url):
2496 mobj = re.match(self._VALID_URL, url)
2498 raise ExtractorError(u'Invalid URL: %s' % url)
2500 # extract uploader (which is in the url)
2501 uploader = mobj.group(1)
2502 # extract simple title (uploader + slug of song title)
2503 slug_title = mobj.group(2)
2504 simple_title = uploader + u'-' + slug_title
2505 full_title = '%s/%s' % (uploader, slug_title)
2507 self.report_resolve(full_title)
# Rebuild a canonical URL before resolving (strips query/fragment noise).
2509 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2510 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2511 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2513 info = json.loads(info_json)
2514 video_id = info['id']
2515 self.report_extraction(full_title)
2517 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2518 stream_json = self._download_webpage(streams_url, full_title,
2519 u'Downloading stream definitions',
2520 u'unable to download stream definitions')
# Only the 128kbps MP3 stream is used; other formats are ignored.
2522 streams = json.loads(stream_json)
2523 mediaURL = streams['http_mp3_128_url']
2524 upload_date = unified_strdate(info['created_at'])
2529 'uploader': info['user']['username'],
2530 'upload_date': upload_date,
2531 'title': info['title'],
2533 'description': info['description'],
2536 class SoundcloudSetIE(InfoExtractor):
# Set (playlist) variant of SoundcloudIE: resolves the set URL, then loops
# over every track in the set and collects the 128kbps MP3 URL of each.
2537 """Information extractor for soundcloud.com sets
2538 To access the media, the uid of the song and a stream token
2539 must be extracted from the page source and the script must make
2540 a request to media.soundcloud.com/crossdomain.xml. Then
2541 the media can be grabbed by requesting from an url composed
2542 of the stream token and uid
2545 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2546 IE_NAME = u'soundcloud:set'
# Progress message for the resolve step.
2548 def report_resolve(self, video_id):
2549 """Report information extraction."""
2550 self.to_screen(u'%s: Resolving id' % video_id)
2552 def _real_extract(self, url):
2553 mobj = re.match(self._VALID_URL, url)
2555 raise ExtractorError(u'Invalid URL: %s' % url)
2557 # extract uploader (which is in the url)
2558 uploader = mobj.group(1)
2559 # extract simple title (uploader + slug of song title)
2560 slug_title = mobj.group(2)
2561 simple_title = uploader + u'-' + slug_title
2562 full_title = '%s/sets/%s' % (uploader, slug_title)
2564 self.report_resolve(full_title)
2566 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2567 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2568 info_json = self._download_webpage(resolv_url, full_title)
# The resolve response may carry an 'errors' list instead of track data;
# report each error message through the downloader.
2571 info = json.loads(info_json)
2572 if 'errors' in info:
2573 for err in info['errors']:
2574 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2577 self.report_extraction(full_title)
2578 for track in info['tracks']:
2579 video_id = track['id']
2581 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2582 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2584 self.report_extraction(video_id)
2585 streams = json.loads(stream_json)
2586 mediaURL = streams['http_mp3_128_url']
2591 'uploader': track['user']['username'],
2592 'upload_date': unified_strdate(track['created_at']),
2593 'title': track['title'],
2595 'description': track['description'],
2600 class InfoQIE(InfoExtractor):
# The page embeds a base64-encoded media path in a 'jsclassref' JS variable;
# decoding it yields the path appended to the rtmpe:// streaming base URL.
2601 """Information extractor for infoq.com"""
2602 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2604 def _real_extract(self, url):
2605 mobj = re.match(self._VALID_URL, url)
2607 raise ExtractorError(u'Invalid URL: %s' % url)
# The full URL doubles as the id for progress reporting.
2609 webpage = self._download_webpage(url, video_id=url)
2610 self.report_extraction(url)
2613 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2615 raise ExtractorError(u'Unable to extract video url')
2616 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2617 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2620 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2622 raise ExtractorError(u'Unable to extract video title')
2623 video_title = mobj.group(1)
2625 # Extract description
2626 video_description = u'No description available.'
2627 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2628 if mobj is not None:
2629 video_description = mobj.group(1)
# Derive id and extension from the final path component of the stream URL.
2631 video_filename = video_url.split('/')[-1]
2632 video_id, extension = video_filename.split('.')
2638 'upload_date': None,
2639 'title': video_title,
2640 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2642 'description': video_description,
2647 class MixcloudIE(InfoExtractor):
# Disabled extractor (_WORKING = False): queries the old /api/1/cloudcast
# JSON endpoint and probes each listed audio URL until one responds.
2648 """Information extractor for www.mixcloud.com"""
2650 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2651 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2652 IE_NAME = u'mixcloud'
2654 def report_download_json(self, file_id):
2655 """Report JSON download."""
2656 self.to_screen(u'Downloading json')
# Returns the URL list for a format; when per-bitrate dicts exist, pick the
# requested (or highest) bitrate, otherwise fall back to the flat list.
2658 def get_urls(self, jsonData, fmt, bitrate='best'):
2659 """Get urls from 'audio_formats' section in json"""
2662 bitrate_list = jsonData[fmt]
2663 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2664 bitrate = max(bitrate_list) # select highest
2666 url_list = jsonData[fmt][bitrate]
2667 except TypeError: # we have no bitrate info.
2668 url_list = jsonData[fmt]
# Probe candidate URLs with a HEAD-style open; network errors skip to the
# next candidate.
2671 def check_urls(self, url_list):
2672 """Returns 1st active url from list"""
2673 for url in url_list:
2675 compat_urllib_request.urlopen(url)
2677 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# --list-formats support: print format/bitrate/extension triples.
2682 def _print_formats(self, formats):
2683 print('Available formats:')
2684 for fmt in formats.keys():
2685 for b in formats[fmt]:
2687 ext = formats[fmt][b][0]
2688 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2689 except TypeError: # we have no bitrate info
2690 ext = formats[fmt][0]
2691 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2694 def _real_extract(self, url):
2695 mobj = re.match(self._VALID_URL, url)
2697 raise ExtractorError(u'Invalid URL: %s' % url)
2698 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on re group results assumes Python 2 str;
# under Python 3 these raise AttributeError (likely why _WORKING is False).
2699 uploader = mobj.group(1).decode('utf-8')
2700 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2702 # construct API request
2703 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2704 # retrieve .json file with links to files
2705 request = compat_urllib_request.Request(file_url)
2707 self.report_download_json(file_url)
2708 jsonData = compat_urllib_request.urlopen(request).read()
2709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2710 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2713 json_data = json.loads(jsonData)
2714 player_url = json_data['player_swf_url']
2715 formats = dict(json_data['audio_formats'])
2717 req_format = self._downloader.params.get('format', None)
2720 if self._downloader.params.get('listformats', None):
2721 self._print_formats(formats)
# Default/best: take the first format whose URL list yields a live URL.
2724 if req_format is None or req_format == 'best':
2725 for format_param in formats.keys():
2726 url_list = self.get_urls(formats, format_param)
2728 file_url = self.check_urls(url_list)
2729 if file_url is not None:
2732 if req_format not in formats:
2733 raise ExtractorError(u'Format is not available')
2735 url_list = self.get_urls(formats, req_format)
2736 file_url = self.check_urls(url_list)
2737 format_param = req_format
2740 'id': file_id.decode('utf-8'),
2741 'url': file_url.decode('utf-8'),
2742 'uploader': uploader.decode('utf-8'),
2743 'upload_date': None,
2744 'title': json_data['name'],
2745 'ext': file_url.split('.')[-1].decode('utf-8'),
2746 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2747 'thumbnail': json_data['thumbnail_url'],
2748 'description': json_data['description'],
2749 'player_url': player_url.decode('utf-8'),
2752 class StanfordOpenClassroomIE(InfoExtractor):
# Three modes based on URL query parameters: a specific video (course+video),
# a course page (course only, expanded via recursive self.extract on each
# VideoPage link), or the site root (expanded over every CoursePage link).
2753 """Information extractor for Stanford's Open ClassRoom"""
2755 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2756 IE_NAME = u'stanfordoc'
2758 def _real_extract(self, url):
2759 mobj = re.match(self._VALID_URL, url)
2761 raise ExtractorError(u'Invalid URL: %s' % url)
2763 if mobj.group('course') and mobj.group('video'): # A specific video
2764 course = mobj.group('course')
2765 video = mobj.group('video')
2767 'id': course + '_' + video,
2769 'upload_date': None,
2772 self.report_extraction(info['id'])
# Per-video metadata XML lives alongside the course's videos directory.
2773 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2774 xmlUrl = baseUrl + video + '.xml'
2776 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2777 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2778 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2779 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2781 info['title'] = mdoc.findall('./title')[0].text
2782 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2784 raise ExtractorError(u'Invalid metadata XML file')
2785 info['ext'] = info['url'].rpartition('.')[2]
2787 elif mobj.group('course'): # A course page
2788 course = mobj.group('course')
2793 'upload_date': None,
2796 coursepage = self._download_webpage(url, info['id'],
2797 note='Downloading course info page',
2798 errnote='Unable to download course info page')
2800 m = re.search('<h1>([^<]+)</h1>', coursepage)
2802 info['title'] = unescapeHTML(m.group(1))
2804 info['title'] = info['id']
2806 m = re.search('<description>([^<]+)</description>', coursepage)
2808 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link becomes a reference entry, expanded recursively below.
2810 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2813 'type': 'reference',
2814 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2818 for entry in info['list']:
2819 assert entry['type'] == 'reference'
2820 results += self.extract(entry['url'])
# else: root page — enumerate all courses from the home page.
2824 'id': 'Stanford OpenClassroom',
2827 'upload_date': None,
2830 self.report_download_webpage(info['id'])
2831 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2833 rootpage = compat_urllib_request.urlopen(rootURL).read()
2834 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2835 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2837 info['title'] = info['id']
2839 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2842 'type': 'reference',
2843 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2848 for entry in info['list']:
2849 assert entry['type'] == 'reference'
2850 results += self.extract(entry['url'])
2853 class MTVIE(InfoExtractor):
# Scrapes MTV meta tags (song name, performer, mtvn_uri, playlist id), then
# asks the mediaGen endpoint for rendition XML and picks the last (highest
# quality) rendition.
2854 """Information extractor for MTV.com"""
2856 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2859 def _real_extract(self, url):
2860 mobj = re.match(self._VALID_URL, url)
2862 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize to http:// for download.
2863 if not mobj.group('proto'):
2864 url = 'http://' + url
2865 video_id = mobj.group('videoid')
2867 webpage = self._download_webpage(url, video_id)
# NOTE(review): .decode('iso-8859-1') on re group results assumes Python 2
# byte strings; under Python 3 this raises AttributeError — confirm.
2869 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2871 raise ExtractorError(u'Unable to extract song name')
2872 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2873 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2875 raise ExtractorError(u'Unable to extract performer')
2876 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2877 video_title = performer + ' - ' + song_name
2879 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2881 raise ExtractorError(u'Unable to mtvn_uri')
2882 mtvn_uri = mobj.group(1)
2884 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2886 raise ExtractorError(u'Unable to extract content id')
2887 content_id = mobj.group(1)
2889 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2890 self.report_extraction(video_id)
2891 request = compat_urllib_request.Request(videogen_url)
2893 metadataXml = compat_urllib_request.urlopen(request).read()
2894 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2895 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2897 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2898 renditions = mdoc.findall('.//rendition')
2900 # For now, always pick the highest quality.
2901 rendition = renditions[-1]
# Build a format label like 'mp4-640x480_800' from the rendition attributes.
2904 _,_,ext = rendition.attrib['type'].partition('/')
2905 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2906 video_url = rendition.find('./src').text
2908 raise ExtractorError('Invalid rendition field.')
2913 'uploader': performer,
2914 'upload_date': None,
2915 'title': video_title,
2923 class YoukuIE(InfoExtractor):
# Youku serves segmented downloads: the playlist JSON supplies a scrambled
# file id and per-segment keys; the file id is unscrambled with a seeded
# shuffle of a fixed character alphabet, then each segment URL is built by
# patching the segment number into the id.
2924 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two bounded random numbers.
2927 nowTime = int(time.time() * 1000)
2928 random1 = random.randint(1000,1998)
2929 random2 = random.randint(1000,9999)
2931 return "%d%d%d" %(nowTime,random1,random2)
# Seeded pseudo-random shuffle of the alphabet; the same seed always yields
# the same permutation, which _get_file_id uses as a lookup table.
2933 def _get_file_ID_mix_string(self, seed):
2935 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2937 for i in range(len(source)):
2938 seed = (seed * 211 + 30031 ) % 65536
2939 index = math.floor(seed / 65536 * len(source) )
2940 mixed.append(source[int(index)])
2941 source.remove(source[int(index)])
2942 #return ''.join(mixed)
# Map each '*'-separated index in the scrambled fileId through the shuffled
# alphabet to recover the real file id.
2945 def _get_file_id(self, fileId, seed):
2946 mixed = self._get_file_ID_mix_string(seed)
2947 ids = fileId.split('*')
2951 realId.append(mixed[int(ch)])
2952 return ''.join(realId)
2954 def _real_extract(self, url):
2955 mobj = re.match(self._VALID_URL, url)
2957 raise ExtractorError(u'Invalid URL: %s' % url)
2958 video_id = mobj.group('ID')
2960 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2962 jsondata = self._download_webpage(info_url, video_id)
2964 self.report_extraction(video_id)
2966 config = json.loads(jsondata)
2968 video_title = config['data'][0]['title']
2969 seed = config['data'][0]['seed']
# Format selection: prefer hd2 for 'best'; 'worst' and explicit formats are
# handled in the (partially elided) branches below.
2971 format = self._downloader.params.get('format', None)
2972 supported_format = list(config['data'][0]['streamfileids'].keys())
2974 if format is None or format == 'best':
2975 if 'hd2' in supported_format:
2980 elif format == 'worst':
2988 fileid = config['data'][0]['streamfileids'][format]
2989 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2990 except (UnicodeDecodeError, ValueError, KeyError):
2991 raise ExtractorError(u'Unable to extract info section')
2994 sid = self._gen_sid()
2995 fileid = self._get_file_id(fileid, seed)
2997 #column 8,9 of fileid represent the segment number
2998 #fileid[7:9] should be changed
2999 for index, key in enumerate(keys):
3001 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3002 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3005 'id': '%s_part%02d' % (video_id, index),
3006 'url': download_url,
3008 'upload_date': None,
3009 'title': video_title,
3012 files_info.append(info)
3017 class XNXXIE(InfoExtractor):
# Regex-scrape extractor: flv_url parameter, <title> tag, and big-thumb
# parameter are pulled directly from the watch page (patterns are class
# constants so they can be reused/overridden).
3018 """Information extractor for xnxx.com"""
3020 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3022 VIDEO_URL_RE = r'flv_url=(.*?)&'
3023 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3024 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3026 def _real_extract(self, url):
3027 mobj = re.match(self._VALID_URL, url)
3029 raise ExtractorError(u'Invalid URL: %s' % url)
3030 video_id = mobj.group(1)
3032 # Get webpage content
3033 webpage = self._download_webpage(url, video_id)
# The flash player receives the stream as a URL-encoded flv_url parameter.
3035 result = re.search(self.VIDEO_URL_RE, webpage)
3037 raise ExtractorError(u'Unable to extract video url')
3038 video_url = compat_urllib_parse.unquote(result.group(1))
3040 result = re.search(self.VIDEO_TITLE_RE, webpage)
3042 raise ExtractorError(u'Unable to extract video title')
3043 video_title = result.group(1)
3045 result = re.search(self.VIDEO_THUMB_RE, webpage)
3047 raise ExtractorError(u'Unable to extract video thumbnail')
3048 video_thumbnail = result.group(1)
3054 'upload_date': None,
3055 'title': video_title,
3057 'thumbnail': video_thumbnail,
3058 'description': None,
# GooglePlusIE: extracts the video embedded in a Google+ post. Two-step
# flow: (1) fetch the post page for date/uploader/title, (2) follow the
# photos page and pick the highest-resolution redirector.googlevideo.com link.
# NOTE(review): lossy dump — `if mobj is None:` guards, `try:` headers and
# some `else:`/`return` lines are missing. Code left byte-identical.
3062 class GooglePlusIE(InfoExtractor):
3063 """Information extractor for plus.google.com."""
3065 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3066 IE_NAME = u'plus.google'
# The five report_* helpers below are thin status-printing wrappers around
# self.to_screen; their docstrings ("extry") carry the original's typo.
3068 def report_extract_entry(self, url):
3069 """Report downloading extry"""
3070 self.to_screen(u'Downloading entry: %s' % url)
3072 def report_date(self, upload_date):
3073 """Report downloading extry"""
3074 self.to_screen(u'Entry date: %s' % upload_date)
3076 def report_uploader(self, uploader):
3077 """Report downloading extry"""
3078 self.to_screen(u'Uploader: %s' % uploader)
3080 def report_title(self, video_title):
3081 """Report downloading extry"""
3082 self.to_screen(u'Title: %s' % video_title)
3084 def report_extract_vid_page(self, video_page):
3085 """Report information extraction."""
3086 self.to_screen(u'Extracting video page: %s' % video_page)
3088 def _real_extract(self, url):
3089 # Extract id from URL
3090 mobj = re.match(self._VALID_URL, url)
3092 raise ExtractorError(u'Invalid URL: %s' % url)
3094 post_url = mobj.group(0)
3095 video_id = mobj.group(1)
3097 video_extension = 'flv'
3099 # Step 1, Retrieve post webpage to extract further information
3100 self.report_extract_entry(post_url)
3101 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3103 # Extract update date
3105 pattern = 'title="Timestamp">(.*?)</a>'
3106 mobj = re.search(pattern, webpage)
3108 upload_date = mobj.group(1)
3109 # Convert timestring to a format suitable for filename
3110 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3111 upload_date = upload_date.strftime('%Y%m%d')
3112 self.report_date(upload_date)
3116 pattern = r'rel\="author".*?>(.*?)</a>'
3117 mobj = re.search(pattern, webpage)
3119 uploader = mobj.group(1)
3120 self.report_uploader(uploader)
3123 # Get the first line for title
3125 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3126 mobj = re.search(pattern, webpage)
3128 video_title = mobj.group(1)
3129 self.report_title(video_title)
3131 # Step 2, Stimulate clicking the image box to launch video
3132 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3133 mobj = re.search(pattern, webpage)
3135 raise ExtractorError(u'Unable to extract video page URL')
3137 video_page = mobj.group(1)
3138 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3139 self.report_extract_vid_page(video_page)
3142 # Extract video links on video page
3143 """Extract video links of all sizes"""
3144 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3145 mobj = re.findall(pattern, webpage)
3147 raise ExtractorError(u'Unable to extract video links')
3149 # Sort in resolution
3150 links = sorted(mobj)
3152 # Choose the lowest of the sort, i.e. highest resolution
3153 video_url = links[-1]
3154 # Only get the url. The resolution part in the tuple has no use anymore
3155 video_url = video_url[-1]
# Py2/Py3 split: str.decode exists only on Python 2; on Python 3 the
# AttributeError branch re-decodes via an ascii bytes round-trip.
3156 # Treat escaped \u0026 style hex
3158 video_url = video_url.decode("unicode_escape")
3159 except AttributeError: # Python 3
3160 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3166 'uploader': uploader,
3167 'upload_date': upload_date,
3168 'title': video_title,
3169 'ext': video_extension,
# NBAIE: derives the 1280x720 mp4 CDN URL directly from the page path and
# scrapes title/date/description via the local _findProp helper (which
# closes over `webpage`).
# NOTE(review): lossy dump — `if mobj is None:` and _findProp's
# `if m: ... else: return default` lines are missing. Code byte-identical.
3172 class NBAIE(InfoExtractor):
3173 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3176 def _real_extract(self, url):
3177 mobj = re.match(self._VALID_URL, url)
3179 raise ExtractorError(u'Invalid URL: %s' % url)
3181 video_id = mobj.group(1)
3182 if video_id.endswith('/index.html'):
3183 video_id = video_id[:-len('/index.html')]
3185 webpage = self._download_webpage(url, video_id)
3187 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3188 def _findProp(rexp, default=None):
3189 m = re.search(rexp, webpage)
3191 return unescapeHTML(m.group(1))
3195 shortened_video_id = video_id.rpartition('/')[2]
3196 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3198 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (the documented field in the class docstring at the top of this file) —
# confirm before relying on it downstream.
3202 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3203 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# JustinTVIE: handles justin.tv / twitch.tv channels (paged archive JSON),
# chapter URLs (XML lookup + kraken metadata) and single broadcasts.
# NOTE(review): lossy dump — guard lines, `try:` headers, the `else:` of the
# archive-search `for`, the paging loop header and `return` statements are
# missing. Code left byte-identical; comments only.
3207 class JustinTVIE(InfoExtractor):
3208 """Information extractor for justin.tv and twitch.tv"""
3209 # TODO: One broadcast may be split into multiple videos. The key
3210 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3211 # starts at 1 and increases. Can we treat all parts as one video?
3213 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3215 (?P<channelid>[^/]+)|
3216 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3217 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3221 _JUSTIN_PAGE_LIMIT = 100
3222 IE_NAME = u'justin.tv'
3224 def report_download_page(self, channel, offset):
3225 """Report attempt to download a single page of videos."""
3226 self.to_screen(u'%s: Downloading video information from %d to %d' %
3227 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3229 # Return count of items, list of *valid* items
3230 def _parse_page(self, url, video_id):
3231 webpage = self._download_webpage(url, video_id,
3232 u'Downloading video info JSON',
3233 u'unable to download video info JSON')
3235 response = json.loads(webpage)
# A dict (instead of a list) from the API signals an error payload.
3236 if type(response) != list:
3237 error_text = response.get('error', 'unknown error')
3238 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3240 for clip in response:
3241 video_url = clip['video_file_url']
3243 video_extension = os.path.splitext(video_url)[1][1:]
3244 video_date = re.sub('-', '', clip['start_time'][:10])
3245 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3246 video_id = clip['id']
3247 video_title = clip.get('title', video_id)
3251 'title': video_title,
3252 'uploader': clip.get('channel_name', video_uploader_id),
3253 'uploader_id': video_uploader_id,
3254 'upload_date': video_date,
3255 'ext': video_extension,
3257 return (len(response), info)
3259 def _real_extract(self, url):
3260 mobj = re.match(self._VALID_URL, url)
3262 raise ExtractorError(u'invalid URL: %s' % url)
3264 api_base = 'http://api.justin.tv'
3266 if mobj.group('channelid'):
3268 video_id = mobj.group('channelid')
3269 api = api_base + '/channel/archives/%s.json' % video_id
3270 elif mobj.group('chapterid'):
# Chapter flow: find the enclosing archive id in the page, then resolve the
# chapter via the by_chapter XML and the twitch kraken API for metadata.
3271 chapter_id = mobj.group('chapterid')
3273 webpage = self._download_webpage(url, chapter_id)
3274 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3276 raise ExtractorError(u'Cannot find archive of a chapter')
3277 archive_id = m.group(1)
3279 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3280 chapter_info_xml = self._download_webpage(api, chapter_id,
3281 note=u'Downloading chapter information',
3282 errnote=u'Chapter information download failed')
3283 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3284 for a in doc.findall('.//archive'):
3285 if archive_id == a.find('./id').text:
3288 raise ExtractorError(u'Could not find chapter in chapter information')
3290 video_url = a.find('./video_file_url').text
3291 video_ext = video_url.rpartition('.')[2] or u'flv'
3293 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3294 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3295 note='Downloading chapter metadata',
3296 errnote='Download of chapter metadata failed')
3297 chapter_info = json.loads(chapter_info_json)
3299 bracket_start = int(doc.find('.//bracket_start').text)
3300 bracket_end = int(doc.find('.//bracket_end').text)
3302 # TODO determine start (and probably fix up file)
3303 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3304 #video_url += u'?start=' + TODO:start_timestamp
3305 # bracket_start is 13290, but we want 51670615
3306 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3307 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3310 'id': u'c' + chapter_id,
3313 'title': chapter_info['title'],
3314 'thumbnail': chapter_info['preview'],
3315 'description': chapter_info['description'],
3316 'uploader': chapter_info['channel']['display_name'],
3317 'uploader_id': chapter_info['channel']['name'],
3321 video_id = mobj.group('videoid')
3322 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3324 self.report_extraction(video_id)
# Paged download: keeps fetching pages of `limit` items until a short page
# (the loop header and offset bookkeeping lines are missing from the dump).
3328 limit = self._JUSTIN_PAGE_LIMIT
3331 self.report_download_page(video_id, offset)
3332 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3333 page_count, page_info = self._parse_page(page_url, video_id)
3334 info.extend(page_info)
3335 if not paged or page_count != limit:
# FunnyOrDieIE: pulls the <source src> video URL, the player h1 title (with
# a <title>-tag fallback) and the og:description from a funnyordie.com page.
# NOTE(review): lossy dump — the `if m is None:` guards and the final
# `return [info]` dict are missing lines. Code left byte-identical.
3340 class FunnyOrDieIE(InfoExtractor):
3341 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3343 def _real_extract(self, url):
3344 mobj = re.match(self._VALID_URL, url)
3346 raise ExtractorError(u'invalid URL: %s' % url)
3348 video_id = mobj.group('id')
3349 webpage = self._download_webpage(url, video_id)
3351 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3353 raise ExtractorError(u'Unable to find video information')
3354 video_url = unescapeHTML(m.group('url'))
# Primary title regex, with the page <title> as fallback when it misses.
3356 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3358 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3360 raise ExtractorError(u'Cannot find video title')
3361 title = clean_html(m.group('title'))
3363 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3365 desc = unescapeHTML(m.group('desc'))
3374 'description': desc,
# SteamIE: bypasses the store age gate with canned query parameters, then
# zips three parallel finditer scans (movie entries, titles, thumbnails)
# into a playlist of videos for the game page.
# NOTE(review): lossy dump — the verbose-regex tail, the `@classmethod`
# decorator line for suitable(), the per-video dict opener and the guard
# before the raise are missing lines. Code left byte-identical.
3378 class SteamIE(InfoExtractor):
3379 _VALID_URL = r"""http://store\.steampowered\.com/
3381 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3383 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE)
# pattern, unlike the base class's plain re.match.
3387 def suitable(cls, url):
3388 """Receives a URL and returns True if suitable for this IE."""
3389 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3391 def _real_extract(self, url):
3392 m = re.match(self._VALID_URL, url, re.VERBOSE)
3393 gameID = m.group('gameID')
3394 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3395 self.report_age_confirmation()
3396 webpage = self._download_webpage(videourl, gameID)
3397 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3399 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3400 mweb = re.finditer(urlRE, webpage)
3401 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3402 titles = re.finditer(namesRE, webpage)
3403 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3404 thumbs = re.finditer(thumbsRE, webpage)
# zip() truncates to the shortest iterator, so entries without a matching
# title/thumbnail are silently dropped.
3406 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3407 video_id = vid.group('videoID')
3408 title = vtitle.group('videoName')
3409 video_url = vid.group('videoURL')
3410 video_thumb = thumb.group('thumbnail')
3412 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3417 'title': unescapeHTML(title),
3418 'thumbnail': video_thumb
3421 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: builds the tcdn.ustream.tv video URL from the recorded id and
# scrapes title/uploader/thumbnail; any failed .group() surfaces as the
# AttributeError caught below.
# NOTE(review): lossy dump — the `try:` header matching the `except` and
# the returned info dict opener are missing lines. Code byte-identical.
3423 class UstreamIE(InfoExtractor):
3424 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3425 IE_NAME = u'ustream'
3427 def _real_extract(self, url):
3428 m = re.match(self._VALID_URL, url)
3429 video_id = m.group('videoID')
3430 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3431 webpage = self._download_webpage(url, video_id)
3432 self.report_extraction(video_id)
3434 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3435 title = m.group('title')
3436 m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3438 uploader = unescapeHTML(m.group('uploader').strip())
3439 m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
3440 thumb = m.group('thumb')
# A miss in any regex above makes m None, so m.group raises AttributeError.
3441 except AttributeError:
3442 raise ExtractorError(u'Unable to extract info')
3448 'uploader': uploader,
# WorldStarHipHopIE: extracts the addVariable("file", ...) URL, derives the
# extension from it, and fixes up title/thumbnail for WSHH "candy" pages.
# NOTE(review): lossy dump — the mp4/flv ext-assignment branches, the else
# raising on no match, and the returned results list are missing lines.
# Code left byte-identical; comments only.
3453 class WorldStarHipHopIE(InfoExtractor):
3454 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3455 IE_NAME = u'WorldStarHipHop'
3457 def _real_extract(self, url):
3458 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3460 m = re.match(self._VALID_URL, url)
3461 video_id = m.group('id')
3463 webpage_src = self._download_webpage(url, video_id)
3465 mobj = re.search(_src_url, webpage_src)
3467 if mobj is not None:
3468 video_url = mobj.group(1)
3469 if 'mp4' in video_url:
3474 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3476 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3479 raise ExtractorError(u'Cannot determine title')
3480 title = mobj.group(1)
3482 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3483 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3484 if mobj is not None:
3485 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span, overriding <title>.
3487 _title = r"""candytitles.*>(.*)</span>"""
3488 mobj = re.search(_title, webpage_src)
3489 if mobj is not None:
3490 title = mobj.group(1)
3497 'thumbnail' : thumbnail,
# RBMARadioIE: reads the show metadata JSON embedded in the page's
# window.gon script, appends a 256kbps cbr parameter to the akamai URL and
# derives the extension from the URL path.
# NOTE(review): lossy dump — the `if m is None:` guard, the `try:` matching
# the `except ValueError`, and the returned dict opener are missing lines.
# Code left byte-identical; comments only.
3502 class RBMARadioIE(InfoExtractor):
3503 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3505 def _real_extract(self, url):
3506 m = re.match(self._VALID_URL, url)
3507 video_id = m.group('videoID')
3509 webpage = self._download_webpage(url, video_id)
3510 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3512 raise ExtractorError(u'Cannot find metadata')
3513 json_data = m.group(1)
3516 data = json.loads(json_data)
3517 except ValueError as e:
3518 raise ExtractorError(u'Invalid JSON: ' + str(e))
3520 video_url = data['akamai_url'] + '&cbr=256'
3521 url_parts = compat_urllib_parse_urlparse(video_url)
3522 video_ext = url_parts.path.rpartition('.')[2]
# Optional metadata uses .get() with {} fallbacks so missing host/image
# sub-objects yield None instead of raising.
3527 'title': data['title'],
3528 'description': data.get('teaser_text'),
3529 'location': data.get('country_of_origin'),
3530 'uploader': data.get('host', {}).get('name'),
3531 'uploader_id': data.get('host', {}).get('slug'),
3532 'thumbnail': data.get('image', {}).get('large_url_2x'),
3533 'duration': data.get('duration'),
# YouPornIE: sets an age_verified cookie, scrapes title/date/uploader, then
# parses the <ul class="downloadList"> links into one format entry each
# (format string like "480p-370k" comes from the CDN path) and applies the
# requested format selection (best/worst/all/specific).
# NOTE(review): lossy dump — guards, the per-link loop header, the
# size/bitrate unpacking, format-list sorting and several returns are
# missing lines. Code left byte-identical; comments only.
3538 class YouPornIE(InfoExtractor):
3539 """Information extractor for youporn.com."""
3540 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3542 def _print_formats(self, formats):
3543 """Print all available formats"""
3544 print(u'Available formats:')
3545 print(u'ext\t\tformat')
3546 print(u'---------------------------------')
3547 for format in formats:
3548 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the single format dict whose 'format' matches req_format
# (loop header and return lines are missing from this dump).
3550 def _specific(self, req_format, formats):
3552 if(x["format"]==req_format):
3556 def _real_extract(self, url):
3557 mobj = re.match(self._VALID_URL, url)
3559 raise ExtractorError(u'Invalid URL: %s' % url)
3561 video_id = mobj.group('videoid')
# The age gate is bypassed by pre-setting the age_verified cookie.
3563 req = compat_urllib_request.Request(url)
3564 req.add_header('Cookie', 'age_verified=1')
3565 webpage = self._download_webpage(req, video_id)
3567 # Get the video title
3568 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3570 raise ExtractorError(u'Unable to extract video title')
3571 video_title = result.group('title').strip()
3573 # Get the video date
3574 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3576 self._downloader.report_warning(u'unable to extract video date')
3579 upload_date = unified_strdate(result.group('date').strip())
3581 # Get the video uploader
3582 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3584 self._downloader.report_warning(u'unable to extract uploader')
3585 video_uploader = None
3587 video_uploader = result.group('uploader').strip()
3588 video_uploader = clean_html( video_uploader )
3590 # Get all of the formats available
3591 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3592 result = re.search(DOWNLOAD_LIST_RE, webpage)
3594 raise ExtractorError(u'Unable to extract download list')
3595 download_list_html = result.group('download_list').strip()
3597 # Get all of the links from the page
3598 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3599 links = re.findall(LINK_RE, download_list_html)
3600 if(len(links) == 0):
3601 raise ExtractorError(u'ERROR: no known formats available for video')
3603 self.to_screen(u'Links found: %d' % len(links))
3608 # A link looks like this:
3609 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3610 # A path looks like this:
3611 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3612 video_url = unescapeHTML( link )
3613 path = compat_urllib_parse_urlparse( video_url ).path
# Path segment 4 encodes "<size>p_<bitrate>k_<id>"; the first two parts
# become the human-readable format id.
3614 extension = os.path.splitext( path )[1][1:]
3615 format = path.split('/')[4].split('_')[:2]
3618 format = "-".join( format )
3619 title = u'%s-%s-%s' % (video_title, size, bitrate)
3624 'uploader': video_uploader,
3625 'upload_date': upload_date,
3630 'description': None,
3634 if self._downloader.params.get('listformats', None):
3635 self._print_formats(formats)
3638 req_format = self._downloader.params.get('format', None)
3639 self.to_screen(u'Format: %s' % req_format)
3641 if req_format is None or req_format == 'best':
3643 elif req_format == 'worst':
3644 return [formats[-1]]
3645 elif req_format in ('-1', 'all'):
3648 format = self._specific( req_format, formats )
3650 raise ExtractorError(u'Requested format not available')
# PornotubeIE: title comes from the URL itself; the flv URL and the upload
# date are scraped from the page.
# NOTE(review): lossy dump — the `if result is None:` guards before each
# raise and the tail of the returned info dict are missing lines.
# Code left byte-identical; comments only.
3655 class PornotubeIE(InfoExtractor):
3656 """Information extractor for pornotube.com."""
3657 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3659 def _real_extract(self, url):
3660 mobj = re.match(self._VALID_URL, url)
3662 raise ExtractorError(u'Invalid URL: %s' % url)
3664 video_id = mobj.group('videoid')
3665 video_title = mobj.group('title')
3667 # Get webpage content
3668 webpage = self._download_webpage(url, video_id)
3671 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3672 result = re.search(VIDEO_URL_RE, webpage)
3674 raise ExtractorError(u'Unable to extract video url')
3675 video_url = compat_urllib_parse.unquote(result.group('url'))
3677 #Get the uploaded date
3678 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3679 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says "title" but the failed regex is the
# upload-date one above — message looks copy-pasted; confirm before reuse.
3681 raise ExtractorError(u'Unable to extract video title')
3682 upload_date = unified_strdate(result.group('date'))
3684 info = {'id': video_id,
3687 'upload_date': upload_date,
3688 'title': video_title,
# YouJizzIE: takes the title from the watch page, then follows the embed
# page and reads the encodeURIComponent("...") file URL from its player JS.
# NOTE(review): lossy dump — the `if result is None:` guards and part of the
# returned info dict are missing lines. Code left byte-identical.
3694 class YouJizzIE(InfoExtractor):
3695 """Information extractor for youjizz.com."""
3696 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3698 def _real_extract(self, url):
3699 mobj = re.match(self._VALID_URL, url)
3701 raise ExtractorError(u'Invalid URL: %s' % url)
3703 video_id = mobj.group('videoid')
3705 # Get webpage content
3706 webpage = self._download_webpage(url, video_id)
3708 # Get the video title
3709 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3711 raise ExtractorError(u'ERROR: unable to extract video title')
3712 video_title = result.group('title').strip()
3714 # Get the embed page
3715 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3717 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is intentionally re-assigned to the embed page's numeric id.
3719 embed_page_url = result.group(0).strip()
3720 video_id = result.group('videoid')
3722 webpage = self._download_webpage(embed_page_url, video_id)
3725 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3727 raise ExtractorError(u'ERROR: unable to extract video url')
3728 video_url = result.group('source')
3730 info = {'id': video_id,
3732 'title': video_title,
3735 'player_url': embed_page_url}
# EightTracksIE: parses the PAGE.mix JSON blob, then walks the playlist via
# the sets/.../play and sets/.../next API (random session id) one track at a
# time until at_last_track.
# NOTE(review): lossy dump — guards, the per-track `info = {` opener, the
# append/break lines, and the assignment of `mix_id` (used at lines 3760 and
# 3780, presumably `mix_id = data['id']`) are missing. Code byte-identical.
3739 class EightTracksIE(InfoExtractor):
3741 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3743 def _real_extract(self, url):
3744 mobj = re.match(self._VALID_URL, url)
3746 raise ExtractorError(u'Invalid URL: %s' % url)
3747 playlist_id = mobj.group('id')
3749 webpage = self._download_webpage(url, playlist_id)
3751 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3753 raise ExtractorError(u'Cannot find trax information')
3754 json_like = m.group(1)
3755 data = json.loads(json_like)
# The API requires a (client-chosen) numeric session id per play-through.
3757 session = str(random.randint(0, 1000000000))
3759 track_count = data['tracks_count']
3760 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3761 next_url = first_url
3763 for i in itertools.count():
3764 api_json = self._download_webpage(next_url, playlist_id,
3765 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3766 errnote=u'Failed to download song information')
3767 api_data = json.loads(api_json)
3768 track_data = api_data[u'set']['track']
3770 'id': track_data['id'],
3771 'url': track_data['track_file_stream_url'],
3772 'title': track_data['performer'] + u' - ' + track_data['name'],
3773 'raw_title': track_data['name'],
3774 'uploader_id': data['user']['login'],
3778 if api_data['set']['at_last_track']:
3780 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: video and thumbnail URLs are derived directly from the id on the
# cdn.keek.com CDN; title and uploader are scraped from the page.
# NOTE(review): lossy dump — the returned info dict opener and the
# url/ext/title entries are missing lines. Code left byte-identical.
3783 class KeekIE(InfoExtractor):
3784 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3787 def _real_extract(self, url):
3788 m = re.match(self._VALID_URL, url)
3789 video_id = m.group('videoID')
3790 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3791 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3792 webpage = self._download_webpage(url, video_id)
3793 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3794 title = unescapeHTML(m.group('title'))
3795 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3796 uploader = clean_html(m.group('uploader'))
3802 'thumbnail': thumbnail,
3803 'uploader': uploader
# TEDIE: dispatches on the URL type — a /talks/ URL resolves one talk (mp4
# link built from its mediaSlug), a /playlists/ URL resolves each talk as a
# url_result entry of a playlist.
# NOTE(review): lossy dump — the verbose-regex alternation/closing lines,
# the `@classmethod` decorator for suitable(), the `video_RE=r'''` opener
# and the final info dict of _talk_info are missing. Code byte-identical.
3807 class TEDIE(InfoExtractor):
3808 _VALID_URL=r'''http://www\.ted\.com/
3810 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3812 ((?P<type_talk>talks)) # We have a simple talk
3814 (/lang/(.*?))? # The url may contain the language
3815 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL is a verbose pattern (needs re.VERBOSE).
3819 def suitable(cls, url):
3820 """Receives a URL and returns True if suitable for this IE."""
3821 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3823 def _real_extract(self, url):
3824 m=re.match(self._VALID_URL, url, re.VERBOSE)
3825 if m.group('type_talk'):
3826 return [self._talk_info(url)]
3828 playlist_id=m.group('playlist_id')
3829 name=m.group('name')
3830 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3831 return [self._playlist_videos_info(url,name,playlist_id)]
3833 def _talk_video_link(self,mediaSlug):
3834 '''Returns the video link for that mediaSlug'''
3835 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3837 def _playlist_videos_info(self,url,name,playlist_id=0):
3838 '''Returns the videos of the playlist'''
3840 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3841 ([.\s]*?)data-playlist_item_id="(\d+)"
3842 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3844 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3845 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3846 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3847 m_names=re.finditer(video_name_RE,webpage)
3849 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3850 m_playlist = re.search(playlist_RE, webpage)
3851 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this IE via url_result('TED').
3853 playlist_entries = []
3854 for m_video, m_name in zip(m_videos,m_names):
3855 video_id=m_video.group('video_id')
3856 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3857 playlist_entries.append(self.url_result(talk_url, 'TED'))
3858 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3860 def _talk_info(self, url, video_id=0):
3861 """Return the video for the talk in the url"""
3862 m=re.match(self._VALID_URL, url,re.VERBOSE)
3863 videoName=m.group('name')
3864 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3865 # If the url includes the language we get the title translated
3866 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3867 title=re.search(title_RE, webpage).group('title')
3868 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3869 "id":(?P<videoID>[\d]+).*?
3870 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3871 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3872 thumb_match=re.search(thumb_RE,webpage)
3873 info_match=re.search(info_RE,webpage,re.VERBOSE)
3874 video_id=info_match.group('videoID')
3875 mediaSlug=info_match.group('mediaSlug')
3876 video_url=self._talk_video_link(mediaSlug)
3882 'thumbnail': thumb_match.group('thumbnail')
# MySpassIE: resolves the video id from the URL path (last, or second-to-
# last when the URL ends with a slash), fetches the site's XML metadata
# endpoint and reads url_flv/title/format_id/description/imagePreview.
# NOTE(review): lossy dump — the empty-last-element check before retrying
# with the parent path, else-branches defaulting format/description/
# thumbnail, and the final info dict opener are missing lines.
# Code left byte-identical; comments only.
3886 class MySpassIE(InfoExtractor):
3887 _VALID_URL = r'http://www.myspass.de/.*'
3889 def _real_extract(self, url):
3890 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3892 # video id is the last path element of the URL
3893 # usually there is a trailing slash, so also try the second but last
3894 url_path = compat_urllib_parse_urlparse(url).path
3895 url_parent_path, video_id = os.path.split(url_path)
3897 _, video_id = os.path.split(url_parent_path)
3900 metadata_url = META_DATA_URL_TEMPLATE % video_id
3901 metadata_text = self._download_webpage(metadata_url, video_id)
3902 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3904 # extract values from metadata
3905 url_flv_el = metadata.find('url_flv')
3906 if url_flv_el is None:
3907 raise ExtractorError(u'Unable to extract download url')
3908 video_url = url_flv_el.text
3909 extension = os.path.splitext(video_url)[1][1:]
3910 title_el = metadata.find('title')
3911 if title_el is None:
3912 raise ExtractorError(u'Unable to extract title')
3913 title = title_el.text
# format/description/thumbnail are optional; their fallback branches are
# among the missing lines.
3914 format_id_el = metadata.find('format_id')
3915 if format_id_el is None:
3918 format = format_id_el.text
3919 description_el = metadata.find('description')
3920 if description_el is not None:
3921 description = description_el.text
3924 imagePreview_el = metadata.find('imagePreview')
3925 if imagePreview_el is not None:
3926 thumbnail = imagePreview_el.text
3935 'thumbnail': thumbnail,
3936 'description': description
# SpiegelIE: scrapes the title from the article page, then downloads the
# per-video XML from video2.spiegel.de and uses the XML's *last* child
# (assumed highest variant) for filename and duration.
# NOTE(review): lossy dump — the `if m is None:` guard and the returned
# info dict opener/url/ext entries are missing lines. Code byte-identical.
3940 class SpiegelIE(InfoExtractor):
3941 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3943 def _real_extract(self, url):
3944 m = re.match(self._VALID_URL, url)
3945 video_id = m.group('videoID')
3947 webpage = self._download_webpage(url, video_id)
3948 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3950 raise ExtractorError(u'Cannot find title')
3951 video_title = unescapeHTML(m.group(1))
3953 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3954 xml_code = self._download_webpage(xml_url, video_id,
3955 note=u'Downloading XML', errnote=u'Failed to download XML')
3957 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: the last <type> entry in the document is taken as the variant
# to download — presumably the best quality; confirm against the XML schema.
3958 last_type = idoc[-1]
3959 filename = last_type.findall('./filename')[0].text
3960 duration = float(last_type.findall('./duration')[0].text)
3962 video_url = 'http://video2.spiegel.de/flash/' + filename
3963 video_ext = filename.rpartition('.')[2]
3968 'title': video_title,
3969 'duration': duration,
# LiveLeakIE: reads the player's `file: "..."` URL, og:title (with the
# "LiveLeak.com -" prefix stripped), og:description and the By: uploader.
# NOTE(review): lossy dump — `if m is None:` guards, the else-branches
# defaulting desc/uploader to None, and the info dict opener are missing
# lines. Code left byte-identical; comments only.
3973 class LiveLeakIE(InfoExtractor):
3975 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3976 IE_NAME = u'liveleak'
3978 def _real_extract(self, url):
3979 mobj = re.match(self._VALID_URL, url)
3981 raise ExtractorError(u'Invalid URL: %s' % url)
3983 video_id = mobj.group('video_id')
3985 webpage = self._download_webpage(url, video_id)
3987 m = re.search(r'file: "(.*?)",', webpage)
3989 raise ExtractorError(u'Unable to find video url')
3990 video_url = m.group(1)
3992 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3994 raise ExtractorError(u'Cannot find video title')
3995 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3997 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3999 desc = unescapeHTML(m.group('desc'))
4003 m = re.search(r'By:.*?(\w+)</a>', webpage)
4005 uploader = clean_html(m.group(1))
4014 'description': desc,
4015 'uploader': uploader
# ARDIE: extracts an ARD Mediathek video. Prefers the documentId= query
# parameter as the id, scrapes mediaCollection.addMediaStream(...) calls,
# and picks media_type 0 at the highest quality; the stream is either RTMP
# (rtmp_url + mp4: play_path) or a direct .mp4 HTTP URL.
# NOTE(review): lossy dump — the `if numid:`/`else:` branch lines, the
# empty-`streams` check guarding the fsk assert, the HTTP `else:` and the
# `return info` are missing lines. Code left byte-identical; comments only.
4020 class ARDIE(InfoExtractor):
4021 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4022 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4023 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4025 def _real_extract(self, url):
4026 # determine video id from url
4027 m = re.match(self._VALID_URL, url)
4029 numid = re.search(r'documentId=([0-9]+)', url)
4031 video_id = numid.group(1)
4033 video_id = m.group('video_id')
4035 # determine title and media streams from webpage
4036 html = self._download_webpage(url, video_id)
4037 title = re.search(self._TITLE, html).group('title')
4038 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker in the page means age-restricted airtime.
4040 assert '"fsk"' in html
4041 raise ExtractorError(u'This video is only available after 8:00 pm')
4043 # choose default media type and highest quality for now
4044 stream = max([s for s in streams if int(s["media_type"]) == 0],
4045 key=lambda s: int(s["quality"]))
4047 # there's two possibilities: RTMP stream or HTTP download
4048 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4049 if stream['rtmp_url']:
4050 self.to_screen(u'RTMP download detected')
4051 assert stream['video_url'].startswith('mp4:')
4052 info["url"] = stream["rtmp_url"]
4053 info["play_path"] = stream['video_url']
4055 assert stream["video_url"].endswith('.mp4')
4056 info["url"] = stream["video_url"]
# TumblrIE: normalizes the URL to the canonical /post/<id>/ form, then reads
# the hex-escaped (\x22) video_file URL and extension from the embedded
# player markup, the first poster as thumbnail, and <title> as title.
# NOTE(review): lossy dump — the `if video is None:` guard/return after the
# "No video found" message and the tail of the returned dict are missing
# lines. Code left byte-identical; comments only.
4059 class TumblrIE(InfoExtractor):
4060 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4062 def _real_extract(self, url):
4063 m_url = re.match(self._VALID_URL, url)
4064 video_id = m_url.group('id')
4065 blog = m_url.group('blog_name')
4067 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4068 webpage = self._download_webpage(url, video_id)
# The markup embeds quotes as \x22 escapes, hence the unusual regex.
4070 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4071 video = re.search(re_video, webpage)
4073 self.to_screen("No video found")
4075 video_url = video.group('video_url')
4076 ext = video.group('ext')
4078 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4079 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4081 # The only place where you can get a title, it's not complete,
4082 # but searching in other places doesn't work for all videos
4083 re_title = r'<title>(?P<title>.*?)</title>'
4084 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4086 return [{'id': video_id,
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # track_id (not `id` — avoid shadowing the builtin) comes from the
        # TralbumData javascript object on the track page
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        mobj = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # the MRSS feed exposes both the media URL and the title
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # the meta tag may use double or single quotes; exactly one group matches
        video_title = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if mobj is None:
            # description is optional: warn and continue
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # strip any query string from the thumbnail URL (second group)
        mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = mobj.group(1)

        # first request yields the node_id needed for the playlist request
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = mobj.group(1)

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # final URL is the APP prefix plus the HTML-unescaped FULLPATH
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            # description is optional: warn and continue
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1) or mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # the numeric id lives in the <article> tag, not the URL
        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
        video_id = mobj.group(1)

        self.report_extraction(video_id)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract description')
        description = mobj.group(1)

        # a second request to the cvp XML endpoint carries the media URL
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': description,
        }]
# NOTE(review): this section of the dump is heavily elided — the body of
# gen_extractors() originally returned a long list of IE instances, of
# which only three entries survive here, and the return statement itself
# is missing. Presumably the list ends with a generic catch-all extractor
# (order matters, per the docstring) — restore the full listing from the
# original module before using this function; do not guess at the entries.
4394 def gen_extractors():
4395     """ Return a list of an instance of every supported extractor.
4396     The order does matter; the first extractor matched is the one handling the URL.
4399     YoutubePlaylistIE(),
4424     StanfordOpenClassroomIE(),
4434     WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live
    # at module level, so a globals() lookup resolves the class.
    class_name = '%sIE' % ie_name
    return globals()[class_name]