2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Broken extractors override this with False to warn users / skip tests.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # _ready guards against running _real_initialize() more than once.
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name by dropping the "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        # note=False suppresses the progress message entirely.
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour an explicit charset in the Content-Type header, fall back to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string URL, not a Request object.
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension. NOTE(review): table reconstructed from the
    # known format lists above; only itag '38' was visible in the dump — confirm.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string used by --list-formats.
    # NOTE(review): values reconstructed from youtube-dl of this era — confirm.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; let the playlist IE claim them.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: name} of available subtitles, or an
        (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick the requested language, else English, else the first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download subtitles for every available language; return a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one line per available format: itag, extension, dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, then (optionally) log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape hidden anti-CSRF tokens from the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in. NOTE(review): only part of this form was visible in the dump;
        # the credential/token fields are reconstructed — confirm against history.
        login_form_strs = {
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (second capture group of _VALID_URL)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalise separators, then try a few date formats.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id,
                'url':          video_real_url,
                'uploader':     video_uploader,
                'uploader_id':  video_uploader_id,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate mirrored YouTube videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Strip title suffix ("_..." slug) and query string from the ID.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still render.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present, in descending order of quality.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; convert to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs (e.g. localized or /network/ pages) are first
    rewritten to the canonical English /watch/ form and re-extracted.
    Returns a single-element list of info dictionaries.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information.

        new_video=False marks the recursive second pass after a
        non-/watch/ URL has been rewritten.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUGFIX: group 1 of the regex is the literal 'people'/'profile'
        # path component; the uploader name is the anchor text in group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the embedded config JSON on the video page, then picks the
    best available codec/quality pair and builds a play_redirect URL.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize scheme-less and direct-link URLs to a canonical page URL
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # split() raises IndexError when the marker is absent,
        # json.loads() raises ValueError on malformed JSON
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available entry in preference order; the for/else
        # runs the else branch only when no quality bucket had a match.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Scrapes videos.arte.tv pages by chaining regex lookups over several
    intermediate pages (see grep_webpage). Live-stream URLs are detected
    but not actually extracted (see extractLiveStream).
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its content; reports errors and
        returns None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search it with regex, and collect the groups named
        by matchTuples [(group_index, key, error_message), ...] into a
        dict. Reports an error and returns None when anything is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the live-stream player data for url.

        NOTE(review): this builds video_url but never returns or queues
        it — live streams are effectively unsupported; confirm intent.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 reference chain and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: detected but not extracted (see extractLiveStream)
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener redirects (via a HEAD request) and otherwise
    scrapes the page for common embedded-player file= patterns.
    """

    _VALID_URL = r'.*'  # last resort: matches anything
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Returns True (and queues the redirect target) when url redirects
        elsewhere; False when it resolves to itself.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force a HEAD request so we never download the body here
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener with HEAD-aware redirect handling
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts 'ytsearch:Q' (first result), 'ytsearchN:Q' (first N results)
    and 'ytsearchall:Q' (up to _max_youtube_results results).
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'ytsearch' keyword, keep N/'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # enough ids or the feed runs out (limit shrinks to totalItems).
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts 'gvsearch:Q', 'gvsearchN:Q' and 'gvsearchall:Q', mirroring
    YoutubeSearchIE but scraping the HTML result pages.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'gvsearch' keyword, keep N/'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what we collected
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts 'yvsearch:Q', 'yvsearchN:Q' and 'yvsearchall:Q', mirroring
    the other search extractors but scraping Yahoo! Video result pages.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'yvsearch' keyword, keep N/'all'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # results can repeat across pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what we collected
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the GData playlists feed and queues each entry's
    content URL, honoring --playlist-start / --playlist-end.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class
        # suitable() (plain re.match) cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so entries can be re-sorted below;
            # entries without 'content' (e.g. deleted videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means the feed is exhausted
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Pages through the channel's /videos listing and queues every watch
    URL found on it.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the pagination link is absent
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed for a user (or 'ytuser:NAME')
    and queues every watch URL, honoring --playlist-start / --playlist-end.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Scrapes the user page for the numeric users_id, then pages through
    the mobile episode-list endpoint collecting video paths and queues
    each one for download as a regular blip.tv URL.
    """

    # Group 1 captures the username from either a blip.tv user-page URL
    # or the internal "bliptvuser:<name>" shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username (group 1 of _VALID_URL).
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # The %s placeholder is filled with the numeric users_id below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

            # The numeric users_id is embedded in the user page markup.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; convert to 0-based.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        # Hand each discovered path back to the downloader as a full URL.
        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Forces the English locale URL, simulates the 'Free download' button,
    then scrapes the real fileshare URL and title from the result page.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction message verbatim.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
            self._downloader.report_error(u'unable to extract title')

        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; under Python 3 these values are already str — confirm.
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials from options or .netrc) during
    initialization, then parses the swf parameter blob embedded in the
    video page to find hd_src/sd_src stream URLs.
    """

    # Named group ID captures the numeric video id from video.php/photo.php URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in the user's .netrc file.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # No downloader means no params to read credentials from.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: continue without login.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Nothing to log in with; proceed anonymously.
        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are a JSON blob sandwiched between these
        # two literal javascript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON inside the outer JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
            video_url = video_data['sd_src']
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        # Page <h2> header carries the video title.
        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

            'title': video_title,
            'duration': video_duration,
            'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Resolves /play/ redirect URLs, then asks blip.tv's JSON API
    (skin=json) for metadata; if the server answers with the media
    itself (Content-Type video/*), a direct download is performed.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension out of the media URL (group 1).
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; rewrite to the canonical /a/a-<id> form and recurse.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask for JSON metadata; the iTunes User-Agent is required by the API.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                    'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']

                # Normalize the site's "mm-dd-yy HH:MMam/pm" stamp to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2299 class MyVideoIE(InfoExtractor):
2300 """Information Extractor for myvideo.de."""
2302 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2303 IE_NAME = u'myvideo'
2305 def __init__(self, downloader=None):
2306 InfoExtractor.__init__(self, downloader)
2308 def report_extraction(self, video_id):
2309 """Report information extraction."""
2310 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2312 def _real_extract(self,url):
2313 mobj = re.match(self._VALID_URL, url)
2315 self._download.report_error(u'invalid URL: %s' % url)
2318 video_id = mobj.group(1)
2321 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2322 webpage = self._download_webpage(webpage_url, video_id)
2324 self.report_extraction(video_id)
2325 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2328 self._downloader.report_error(u'unable to extract media URL')
2330 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2332 mobj = re.search('<title>([^<]+)</title>', webpage)
2334 self._downloader.report_error(u'unable to extract title')
2337 video_title = mobj.group(1)
2343 'upload_date': None,
2344 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     """

    # Bitrates the site serves, lowest to highest (turls ends up in this order).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with whitespace/comments, hence re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Shortname form (":tds" etc.): rewrite to the show's
        # full-episodes URL and re-match so the named groups are set.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Clip URLs carry the title directly; otherwise an empty
        # 'episode' group means "download the newest episode".
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            # "Newest" requests redirect to the concrete episode; re-match
            # the final URL to recover the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mtvn URIs embedded in the player markup identify the media.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The RSS index enumerates the episode's parts (one <item> each).
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like "...:<show>.com:<mediaId>"; split out both parts.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # mediaGen config lists one <rendition> per available bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, rtmp_video_url = f, v

            # Rewrite the RTMP stream path onto the plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            # partNum is 0-based; display as 1-based part number.
            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)

                'upload_date': officialDate,
                'description': officialTitle,

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads og: meta tags for description/thumbnail/player, follows the
    player's config= parameter to a JSON-ish config, and takes the
    stream URL from the configured playlist.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset the server declared, else UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))

        # Metadata lives in <meta> tags; the player URL carries config=.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Playlist entry 1 is the actual video stream.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for title/description/thumbnail,
    then the Adobe f4m manifest, and assembles the final segment URL
    from the manifest's media/id entries.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('videoid')

            'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # findall(...)[0] raises IndexError when a node is missing.
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            self._downloader.report_error(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
            # f4m manifest elements live in the Adobe f4m namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Build the first-segment URL from the manifest host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Scrapes the flv_url page parameter for the media URL, the page
    <title> for the video title, and a thumbnail URL from the markup.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flv URL is percent-encoded in a flv_url= page parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
            self._downloader.report_error(u'unable to extract video url')

        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title is the <title> text up to the " - XVID" suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
            self._downloader.report_error(u'unable to extract video title')

        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
            self._downloader.report_error(u'unable to extract video thumbnail')

        # group(0): the whole matched thumbnail URL.
        video_thumbnail = mobj.group(0)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    # Group 1: uploader slug; group 2: track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve API turns the public page URL into track JSON.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint maps the numeric track id to media URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        # Pick the 128kbit/s MP3 HTTP stream.
        mediaURL = streams['http_mp3_128_url']

            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'description': info['description'],
2822 class SoundcloudSetIE(InfoExtractor):
2823 """Information extractor for soundcloud.com sets
2824 To access the media, the uid of the song and a stream token
2825 must be extracted from the page source and the script must make
2826 a request to media.soundcloud.com/crossdomain.xml. Then
2827 the media can be grabbed by requesting from an url composed
2828 of the stream token and uid
2831 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2832 IE_NAME = u'soundcloud'
2834 def __init__(self, downloader=None):
2835 InfoExtractor.__init__(self, downloader)
2837 def report_resolve(self, video_id):
2838 """Report information extraction."""
2839 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2841 def report_extraction(self, video_id):
2842 """Report information extraction."""
2843 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2845 def _real_extract(self, url):
2846 mobj = re.match(self._VALID_URL, url)
2848 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2851 # extract uploader (which is in the url)
2852 uploader = mobj.group(1)
2853 # extract simple title (uploader + slug of song title)
2854 slug_title = mobj.group(2)
2855 simple_title = uploader + u'-' + slug_title
2857 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2859 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2860 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2861 request = compat_urllib_request.Request(resolv_url)
2863 info_json_bytes = compat_urllib_request.urlopen(request).read()
2864 info_json = info_json_bytes.decode('utf-8')
2865 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2866 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2870 info = json.loads(info_json)
2871 if 'errors' in info:
2872 for err in info['errors']:
2873 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2876 for track in info['tracks']:
2877 video_id = track['id']
2878 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2880 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2881 request = compat_urllib_request.Request(streams_url)
2883 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2884 stream_json = stream_json_bytes.decode('utf-8')
2885 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2886 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2889 streams = json.loads(stream_json)
2890 mediaURL = streams['http_mp3_128_url']
2895 'uploader': track['user']['username'],
2896 'upload_date': track['created_at'],
2897 'title': track['title'],
2899 'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    Decodes the base64 jsclassref page attribute into the RTMPE stream
    path and scrapes title/description from the page source.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # jsclassref holds the base64-encoded, percent-encoded stream id.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
            self._downloader.report_error(u'unable to extract video url')

        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title comes from an inline javascript assignment.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
            self._downloader.report_error(u'unable to extract video title')

        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the stream URL's basename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
2958 class MixcloudIE(InfoExtractor):
2959 """Information extractor for www.mixcloud.com"""
2961 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2962 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2963 IE_NAME = u'mixcloud'
def __init__(self, downloader=None):
    """Create the extractor; *downloader* is forwarded to the base class."""
    InfoExtractor.__init__(self, downloader)
def report_download_json(self, file_id):
    """Announce that the metadata JSON file is being fetched."""
    notice = u'[%s] Downloading json' % self.IE_NAME
    self._downloader.to_screen(notice)
def report_extraction(self, file_id):
    """Tell the user that metadata extraction has started for *file_id*."""
    notice = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(notice)
2976 def get_urls(self, jsonData, fmt, bitrate='best'):
2977 """Get urls from 'audio_formats' section in json"""
2980 bitrate_list = jsonData[fmt]
2981 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2982 bitrate = max(bitrate_list) # select highest
2984 url_list = jsonData[fmt][bitrate]
2985 except TypeError: # we have no bitrate info.
2986 url_list = jsonData[fmt]
2989 def check_urls(self, url_list):
2990 """Returns 1st active url from list"""
2991 for url in url_list:
2993 compat_urllib_request.urlopen(url)
2995 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3000 def _print_formats(self, formats):
3001 print('Available formats:')
3002 for fmt in formats.keys():
3003 for b in formats[fmt]:
3005 ext = formats[fmt][b][0]
3006 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3007 except TypeError: # we have no bitrate info
3008 ext = formats[fmt][0]
3009 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3012 def _real_extract(self, url):
3013 mobj = re.match(self._VALID_URL, url)
3015 self._downloader.report_error(u'invalid URL: %s' % url)
3017 # extract uploader & filename from url
3018 uploader = mobj.group(1).decode('utf-8')
3019 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3021 # construct API request
3022 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3023 # retrieve .json file with links to files
3024 request = compat_urllib_request.Request(file_url)
3026 self.report_download_json(file_url)
3027 jsonData = compat_urllib_request.urlopen(request).read()
3028 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3029 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3033 json_data = json.loads(jsonData)
3034 player_url = json_data['player_swf_url']
3035 formats = dict(json_data['audio_formats'])
3037 req_format = self._downloader.params.get('format', None)
3040 if self._downloader.params.get('listformats', None):
3041 self._print_formats(formats)
3044 if req_format is None or req_format == 'best':
3045 for format_param in formats.keys():
3046 url_list = self.get_urls(formats, format_param)
3048 file_url = self.check_urls(url_list)
3049 if file_url is not None:
3052 if req_format not in formats:
3053 self._downloader.report_error(u'format is not available')
3056 url_list = self.get_urls(formats, req_format)
3057 file_url = self.check_urls(url_list)
3058 format_param = req_format
3061 'id': file_id.decode('utf-8'),
3062 'url': file_url.decode('utf-8'),
3063 'uploader': uploader.decode('utf-8'),
3064 'upload_date': None,
3065 'title': json_data['name'],
3066 'ext': file_url.split('.')[-1].decode('utf-8'),
3067 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3068 'thumbnail': json_data['thumbnail_url'],
3069 'description': json_data['description'],
3070 'player_url': player_url.decode('utf-8'),
3073 class StanfordOpenClassroomIE(InfoExtractor):
3074 """Information extractor for Stanford's Open ClassRoom"""
3076 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3077 IE_NAME = u'stanfordoc'
def report_download_webpage(self, objid):
    """Announce that the page for *objid* is being downloaded."""
    notice = u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)
    self._downloader.to_screen(notice)
def report_extraction(self, video_id):
    """Tell the user that metadata extraction has started for *video_id*."""
    notice = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(notice)
3087 def _real_extract(self, url):
3088 mobj = re.match(self._VALID_URL, url)
3090 raise ExtractorError(u'Invalid URL: %s' % url)
3092 if mobj.group('course') and mobj.group('video'): # A specific video
3093 course = mobj.group('course')
3094 video = mobj.group('video')
3096 'id': course + '_' + video,
3098 'upload_date': None,
3101 self.report_extraction(info['id'])
3102 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3103 xmlUrl = baseUrl + video + '.xml'
3105 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3106 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3107 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3109 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3111 info['title'] = mdoc.findall('./title')[0].text
3112 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3114 self._downloader.report_error(u'Invalid metadata XML file')
3116 info['ext'] = info['url'].rpartition('.')[2]
3118 elif mobj.group('course'): # A course page
3119 course = mobj.group('course')
3124 'upload_date': None,
3127 coursepage = self._download_webpage(url, info['id'],
3128 note='Downloading course info page',
3129 errnote='Unable to download course info page')
3131 m = re.search('<h1>([^<]+)</h1>', coursepage)
3133 info['title'] = unescapeHTML(m.group(1))
3135 info['title'] = info['id']
3137 m = re.search('<description>([^<]+)</description>', coursepage)
3139 info['description'] = unescapeHTML(m.group(1))
3141 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3144 'type': 'reference',
3145 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3149 for entry in info['list']:
3150 assert entry['type'] == 'reference'
3151 results += self.extract(entry['url'])
3155 'id': 'Stanford OpenClassroom',
3158 'upload_date': None,
3161 self.report_download_webpage(info['id'])
3162 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3164 rootpage = compat_urllib_request.urlopen(rootURL).read()
3165 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3166 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3169 info['title'] = info['id']
3171 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3174 'type': 'reference',
3175 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3180 for entry in info['list']:
3181 assert entry['type'] == 'reference'
3182 results += self.extract(entry['url'])
3185 class MTVIE(InfoExtractor):
3186 """Information extractor for MTV.com"""
3188 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
def report_extraction(self, video_id):
    """Tell the user that metadata extraction has started for *video_id*."""
    notice = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(notice)
3195 def _real_extract(self, url):
3196 mobj = re.match(self._VALID_URL, url)
3198 self._downloader.report_error(u'invalid URL: %s' % url)
3200 if not mobj.group('proto'):
3201 url = 'http://' + url
3202 video_id = mobj.group('videoid')
3204 webpage = self._download_webpage(url, video_id)
3206 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3208 self._downloader.report_error(u'unable to extract song name')
3210 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3211 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3213 self._downloader.report_error(u'unable to extract performer')
3215 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3216 video_title = performer + ' - ' + song_name
3218 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3220 self._downloader.report_error(u'unable to mtvn_uri')
3222 mtvn_uri = mobj.group(1)
3224 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3226 self._downloader.report_error(u'unable to extract content id')
3228 content_id = mobj.group(1)
3230 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3231 self.report_extraction(video_id)
3232 request = compat_urllib_request.Request(videogen_url)
3234 metadataXml = compat_urllib_request.urlopen(request).read()
3235 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3236 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3239 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3240 renditions = mdoc.findall('.//rendition')
3242 # For now, always pick the highest quality.
3243 rendition = renditions[-1]
3246 _,_,ext = rendition.attrib['type'].partition('/')
3247 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3248 video_url = rendition.find('./src').text
3250 self._downloader.trouble('Invalid rendition field.')
3256 'uploader': performer,
3257 'upload_date': None,
3258 'title': video_title,
3266 class YoukuIE(InfoExtractor):
3267 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
def report_download_webpage(self, file_id):
    """Announce that the page for *file_id* is being downloaded."""
    notice = u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id)
    self._downloader.to_screen(notice)
def report_extraction(self, file_id):
    """Tell the user that metadata extraction has started for *file_id*."""
    notice = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(notice)
3278 nowTime = int(time.time() * 1000)
3279 random1 = random.randint(1000,1998)
3280 random2 = random.randint(1000,9999)
3282 return "%d%d%d" %(nowTime,random1,random2)
3284 def _get_file_ID_mix_string(self, seed):
3286 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3288 for i in range(len(source)):
3289 seed = (seed * 211 + 30031 ) % 65536
3290 index = math.floor(seed / 65536 * len(source) )
3291 mixed.append(source[int(index)])
3292 source.remove(source[int(index)])
3293 #return ''.join(mixed)
3296 def _get_file_id(self, fileId, seed):
3297 mixed = self._get_file_ID_mix_string(seed)
3298 ids = fileId.split('*')
3302 realId.append(mixed[int(ch)])
3303 return ''.join(realId)
3305 def _real_extract(self, url):
3306 mobj = re.match(self._VALID_URL, url)
3308 self._downloader.report_error(u'invalid URL: %s' % url)
3310 video_id = mobj.group('ID')
3312 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3314 request = compat_urllib_request.Request(info_url, None, std_headers)
3316 self.report_download_webpage(video_id)
3317 jsondata = compat_urllib_request.urlopen(request).read()
3318 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3319 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3322 self.report_extraction(video_id)
3324 jsonstr = jsondata.decode('utf-8')
3325 config = json.loads(jsonstr)
3327 video_title = config['data'][0]['title']
3328 seed = config['data'][0]['seed']
3330 format = self._downloader.params.get('format', None)
3331 supported_format = list(config['data'][0]['streamfileids'].keys())
3333 if format is None or format == 'best':
3334 if 'hd2' in supported_format:
3339 elif format == 'worst':
3347 fileid = config['data'][0]['streamfileids'][format]
3348 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3349 except (UnicodeDecodeError, ValueError, KeyError):
3350 self._downloader.report_error(u'unable to extract info section')
3354 sid = self._gen_sid()
3355 fileid = self._get_file_id(fileid, seed)
3357 #column 8,9 of fileid represent the segment number
3358 #fileid[7:9] should be changed
3359 for index, key in enumerate(keys):
3361 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3362 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3365 'id': '%s_part%02d' % (video_id, index),
3366 'url': download_url,
3368 'upload_date': None,
3369 'title': video_title,
3372 files_info.append(info)
3377 class XNXXIE(InfoExtractor):
3378 """Information extractor for xnxx.com"""
3380 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3382 VIDEO_URL_RE = r'flv_url=(.*?)&'
3383 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3384 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
def report_webpage(self, video_id):
    """Announce that the page for *video_id* is being downloaded."""
    notice = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(notice)
def report_extraction(self, video_id):
    """Tell the user that metadata extraction has started for *video_id*."""
    notice = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(notice)
3394 def _real_extract(self, url):
3395 mobj = re.match(self._VALID_URL, url)
3397 self._downloader.report_error(u'invalid URL: %s' % url)
3399 video_id = mobj.group(1)
3401 self.report_webpage(video_id)
3403 # Get webpage content
3405 webpage_bytes = compat_urllib_request.urlopen(url).read()
3406 webpage = webpage_bytes.decode('utf-8')
3407 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3408 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3411 result = re.search(self.VIDEO_URL_RE, webpage)
3413 self._downloader.report_error(u'unable to extract video url')
3415 video_url = compat_urllib_parse.unquote(result.group(1))
3417 result = re.search(self.VIDEO_TITLE_RE, webpage)
3419 self._downloader.report_error(u'unable to extract video title')
3421 video_title = result.group(1)
3423 result = re.search(self.VIDEO_THUMB_RE, webpage)
3425 self._downloader.report_error(u'unable to extract video thumbnail')
3427 video_thumbnail = result.group(1)
3433 'upload_date': None,
3434 'title': video_title,
3436 'thumbnail': video_thumbnail,
3437 'description': None,
3441 class GooglePlusIE(InfoExtractor):
3442 """Information extractor for plus.google.com."""
3444 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3445 IE_NAME = u'plus.google'
def __init__(self, downloader=None):
    """Create the extractor; *downloader* is forwarded to the base class."""
    InfoExtractor.__init__(self, downloader)
def report_extract_entry(self, url):
    """Announce that the post page at *url* is being downloaded."""
    notice = u'[plus.google] Downloading entry: %s' % url
    self._downloader.to_screen(notice)
def report_date(self, upload_date):
    """Show the upload date that was extracted from the entry."""
    notice = u'[plus.google] Entry date: %s' % upload_date
    self._downloader.to_screen(notice)
def report_uploader(self, uploader):
    """Show the uploader name that was extracted from the entry."""
    notice = u'[plus.google] Uploader: %s' % uploader
    self._downloader.to_screen(notice)
def report_title(self, video_title):
    """Show the title that was extracted from the entry."""
    notice = u'[plus.google] Title: %s' % video_title
    self._downloader.to_screen(notice)
def report_extract_vid_page(self, video_page):
    """Announce that the secondary video page is being processed."""
    notice = u'[plus.google] Extracting video page: %s' % video_page
    self._downloader.to_screen(notice)
3470 def _real_extract(self, url):
3471 # Extract id from URL
3472 mobj = re.match(self._VALID_URL, url)
3474 self._downloader.report_error(u'Invalid URL: %s' % url)
3477 post_url = mobj.group(0)
3478 video_id = mobj.group(1)
3480 video_extension = 'flv'
3482 # Step 1, Retrieve post webpage to extract further information
3483 self.report_extract_entry(post_url)
3484 request = compat_urllib_request.Request(post_url)
3486 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3487 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3488 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3491 # Extract update date
3493 pattern = 'title="Timestamp">(.*?)</a>'
3494 mobj = re.search(pattern, webpage)
3496 upload_date = mobj.group(1)
3497 # Convert timestring to a format suitable for filename
3498 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3499 upload_date = upload_date.strftime('%Y%m%d')
3500 self.report_date(upload_date)
3504 pattern = r'rel\="author".*?>(.*?)</a>'
3505 mobj = re.search(pattern, webpage)
3507 uploader = mobj.group(1)
3508 self.report_uploader(uploader)
3511 # Get the first line for title
3513 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3514 mobj = re.search(pattern, webpage)
3516 video_title = mobj.group(1)
3517 self.report_title(video_title)
3519 # Step 2, Stimulate clicking the image box to launch video
3520 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3521 mobj = re.search(pattern, webpage)
3523 self._downloader.report_error(u'unable to extract video page URL')
3525 video_page = mobj.group(1)
3526 request = compat_urllib_request.Request(video_page)
3528 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3529 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3530 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3532 self.report_extract_vid_page(video_page)
3535 # Extract video links on video page
3536 """Extract video links of all sizes"""
3537 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3538 mobj = re.findall(pattern, webpage)
3540 self._downloader.report_error(u'unable to extract video links')
3542 # Sort in resolution
3543 links = sorted(mobj)
3545 # Choose the lowest of the sort, i.e. highest resolution
3546 video_url = links[-1]
3547 # Only get the url. The resolution part in the tuple has no use anymore
3548 video_url = video_url[-1]
3549 # Treat escaped \u0026 style hex
3551 video_url = video_url.decode("unicode_escape")
3552 except AttributeError: # Python 3
3553 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3559 'uploader': uploader,
3560 'upload_date': upload_date,
3561 'title': video_title,
3562 'ext': video_extension,
3565 class NBAIE(InfoExtractor):
3566 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3569 def _real_extract(self, url):
3570 mobj = re.match(self._VALID_URL, url)
3572 self._downloader.report_error(u'invalid URL: %s' % url)
3575 video_id = mobj.group(1)
3576 if video_id.endswith('/index.html'):
3577 video_id = video_id[:-len('/index.html')]
3579 webpage = self._download_webpage(url, video_id)
3581 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3582 def _findProp(rexp, default=None):
3583 m = re.search(rexp, webpage)
3585 return unescapeHTML(m.group(1))
3589 shortened_video_id = video_id.rpartition('/')[2]
3590 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3592 'id': shortened_video_id,
3596 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3597 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3601 class JustinTVIE(InfoExtractor):
3602 """Information extractor for justin.tv and twitch.tv"""
3603 # TODO: One broadcast may be split into multiple videos. The key
3604 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3605 # starts at 1 and increases. Can we treat all parts as one video?
3607 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3608 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3609 _JUSTIN_PAGE_LIMIT = 100
3610 IE_NAME = u'justin.tv'
def report_extraction(self, file_id):
    """Tell the user that metadata extraction has started for *file_id*."""
    notice = u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)
    self._downloader.to_screen(notice)
def report_download_page(self, channel, offset):
    """Announce the fetch of one page of video records for *channel*,
    covering items [offset, offset + page limit)."""
    upper_bound = offset + self._JUSTIN_PAGE_LIMIT
    notice = u'[%s] %s: Downloading video information from %d to %d' % (
        self.IE_NAME, channel, offset, upper_bound)
    self._downloader.to_screen(notice)
3621 # Return count of items, list of *valid* items
3622 def _parse_page(self, url):
3624 urlh = compat_urllib_request.urlopen(url)
3625 webpage_bytes = urlh.read()
3626 webpage = webpage_bytes.decode('utf-8', 'ignore')
3627 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3628 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3631 response = json.loads(webpage)
3632 if type(response) != list:
3633 error_text = response.get('error', 'unknown error')
3634 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3637 for clip in response:
3638 video_url = clip['video_file_url']
3640 video_extension = os.path.splitext(video_url)[1][1:]
3641 video_date = re.sub('-', '', clip['start_time'][:10])
3642 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3643 video_id = clip['id']
3644 video_title = clip.get('title', video_id)
3648 'title': video_title,
3649 'uploader': clip.get('channel_name', video_uploader_id),
3650 'uploader_id': video_uploader_id,
3651 'upload_date': video_date,
3652 'ext': video_extension,
3654 return (len(response), info)
3656 def _real_extract(self, url):
3657 mobj = re.match(self._VALID_URL, url)
3659 self._downloader.report_error(u'invalid URL: %s' % url)
3662 api = 'http://api.justin.tv'
3663 video_id = mobj.group(mobj.lastindex)
3665 if mobj.lastindex == 1:
3667 api += '/channel/archives/%s.json'
3669 api += '/broadcast/by_archive/%s.json'
3670 api = api % (video_id,)
3672 self.report_extraction(video_id)
3676 limit = self._JUSTIN_PAGE_LIMIT
3679 self.report_download_page(video_id, offset)
3680 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3681 page_count, page_info = self._parse_page(page_url)
3682 info.extend(page_info)
3683 if not paged or page_count != limit:
3688 class FunnyOrDieIE(InfoExtractor):
3689 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3691 def _real_extract(self, url):
3692 mobj = re.match(self._VALID_URL, url)
3694 self._downloader.report_error(u'invalid URL: %s' % url)
3697 video_id = mobj.group('id')
3698 webpage = self._download_webpage(url, video_id)
3700 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3702 self._downloader.report_error(u'unable to find video information')
3703 video_url = unescapeHTML(m.group('url'))
3705 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3707 self._downloader.trouble(u'Cannot find video title')
3708 title = clean_html(m.group('title'))
3710 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3712 desc = unescapeHTML(m.group('desc'))
3721 'description': desc,
3725 class SteamIE(InfoExtractor):
3726 _VALID_URL = r"""http://store.steampowered.com/
3727 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3729 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3733 def suitable(cls, url):
3734 """Receives a URL and returns True if suitable for this IE."""
3735 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3737 def _real_extract(self, url):
3738 m = re.match(self._VALID_URL, url, re.VERBOSE)
3739 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3740 gameID = m.group('gameID')
3741 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3742 webpage = self._download_webpage(videourl, gameID)
3743 mweb = re.finditer(urlRE, webpage)
3744 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3745 titles = re.finditer(namesRE, webpage)
3746 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3747 thumbs = re.finditer(thumbsRE, webpage)
3749 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3750 video_id = vid.group('videoID')
3751 title = vtitle.group('videoName')
3752 video_url = vid.group('videoURL')
3753 video_thumb = thumb.group('thumbnail')
3755 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3760 'title': unescapeHTML(title),
3761 'thumbnail': video_thumb
3766 class UstreamIE(InfoExtractor):
3767 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3768 IE_NAME = u'ustream'
3770 def _real_extract(self, url):
3771 m = re.match(self._VALID_URL, url)
3772 video_id = m.group('videoID')
3773 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3774 webpage = self._download_webpage(url, video_id)
3775 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3776 title = m.group('title')
3777 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3778 uploader = m.group('uploader')
3784 'uploader': uploader
3788 class WorldStarHipHopIE(InfoExtractor):
3789 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3790 IE_NAME = u'WorldStarHipHop'
3792 def _real_extract(self, url):
3793 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3795 webpage_src = compat_urllib_request.urlopen(url).read()
3796 webpage_src = webpage_src.decode('utf-8')
3798 mobj = re.search(_src_url, webpage_src)
3800 m = re.match(self._VALID_URL, url)
3801 video_id = m.group('id')
3803 if mobj is not None:
3804 video_url = mobj.group()
3805 if 'mp4' in video_url:
3810 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3813 _title = r"""<title>(.*)</title>"""
3815 mobj = re.search(_title, webpage_src)
3817 if mobj is not None:
3818 title = mobj.group(1)
3820 title = 'World Start Hip Hop - %s' % time.ctime()
3822 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3823 mobj = re.search(_thumbnail, webpage_src)
3825 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3826 if mobj is not None:
3827 thumbnail = mobj.group(1)
3829 _title = r"""candytitles.*>(.*)</span>"""
3830 mobj = re.search(_title, webpage_src)
3831 if mobj is not None:
3832 title = mobj.group(1)
3839 'thumbnail' : thumbnail,
3844 class RBMARadioIE(InfoExtractor):
3845 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3847 def _real_extract(self, url):
3848 m = re.match(self._VALID_URL, url)
3849 video_id = m.group('videoID')
3851 webpage = self._download_webpage(url, video_id)
3852 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3854 raise ExtractorError(u'Cannot find metadata')
3855 json_data = m.group(1)
3858 data = json.loads(json_data)
3859 except ValueError as e:
3860 raise ExtractorError(u'Invalid JSON: ' + str(e))
3862 video_url = data['akamai_url'] + '&cbr=256'
3863 url_parts = compat_urllib_parse_urlparse(video_url)
3864 video_ext = url_parts.path.rpartition('.')[2]
3869 'title': data['title'],
3870 'description': data.get('teaser_text'),
3871 'location': data.get('country_of_origin'),
3872 'uploader': data.get('host', {}).get('name'),
3873 'uploader_id': data.get('host', {}).get('slug'),
3874 'thumbnail': data.get('image', {}).get('large_url_2x'),
3875 'duration': data.get('duration'),
3880 class YouPornIE(InfoExtractor):
3881 """Information extractor for youporn.com."""
3882 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3884 def _print_formats(self, formats):
3885 """Print all available formats"""
3886 print(u'Available formats:')
3887 print(u'ext\t\tformat')
3888 print(u'---------------------------------')
3889 for format in formats:
3890 print(u'%s\t\t%s' % (format['ext'], format['format']))
3892 def _specific(self, req_format, formats):
3894 if(x["format"]==req_format):
3898 def _real_extract(self, url):
3899 mobj = re.match(self._VALID_URL, url)
3901 self._downloader.report_error(u'invalid URL: %s' % url)
3904 video_id = mobj.group('videoid')
3906 req = compat_urllib_request.Request(url)
3907 req.add_header('Cookie', 'age_verified=1')
3908 webpage = self._download_webpage(req, video_id)
3910 # Get the video title
3911 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3913 raise ExtractorError(u'Unable to extract video title')
3914 video_title = result.group('title').strip()
3916 # Get the video date
3917 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3919 self._downloader.report_warning(u'unable to extract video date')
3922 upload_date = result.group('date').strip()
3924 # Get the video uploader
3925 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3927 self._downloader.report_warning(u'unable to extract uploader')
3928 video_uploader = None
3930 video_uploader = result.group('uploader').strip()
3931 video_uploader = clean_html( video_uploader )
3933 # Get all of the formats available
3934 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3935 result = re.search(DOWNLOAD_LIST_RE, webpage)
3937 raise ExtractorError(u'Unable to extract download list')
3938 download_list_html = result.group('download_list').strip()
3940 # Get all of the links from the page
3941 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3942 links = re.findall(LINK_RE, download_list_html)
3943 if(len(links) == 0):
3944 raise ExtractorError(u'ERROR: no known formats available for video')
3946 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3951 # A link looks like this:
3952 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3953 # A path looks like this:
3954 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3955 video_url = unescapeHTML( link )
3956 path = compat_urllib_parse_urlparse( video_url ).path
3957 extension = os.path.splitext( path )[1][1:]
3958 format = path.split('/')[4].split('_')[:2]
3961 format = "-".join( format )
3962 title = u'%s-%s-%s' % (video_title, size, bitrate)
3967 'uploader': video_uploader,
3968 'upload_date': upload_date,
3973 'description': None,
3977 if self._downloader.params.get('listformats', None):
3978 self._print_formats(formats)
3981 req_format = self._downloader.params.get('format', None)
3982 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3984 if req_format is None or req_format == 'best':
3986 elif req_format == 'worst':
3987 return [formats[-1]]
3988 elif req_format in ('-1', 'all'):
3991 format = self._specific( req_format, formats )
3993 self._downloader.report_error(u'requested format not available')
3999 class PornotubeIE(InfoExtractor):
4000 """Information extractor for pornotube.com."""
4001 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
4003 def _real_extract(self, url):
4004 mobj = re.match(self._VALID_URL, url)
4006 self._downloader.report_error(u'invalid URL: %s' % url)
4009 video_id = mobj.group('videoid')
4010 video_title = mobj.group('title')
4012 # Get webpage content
4013 webpage = self._download_webpage(url, video_id)
4016 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4017 result = re.search(VIDEO_URL_RE, webpage)
4019 self._downloader.report_error(u'unable to extract video url')
4021 video_url = compat_urllib_parse.unquote(result.group('url'))
4023 #Get the uploaded date
4024 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4025 result = re.search(VIDEO_UPLOADED_RE, webpage)
4027 self._downloader.report_error(u'unable to extract video title')
4029 upload_date = result.group('date')
4031 info = {'id': video_id,
4034 'upload_date': upload_date,
4035 'title': video_title,
# Extractor for youjizz.com: reads the page <title>, follows the embed page,
# and lifts the real media URL out of the embed's flash-player setup call.
# NOTE(review): sampled excerpt — line-number prefixes embedded and the
# `if result is None:` guards implied by the raise statements are not in view.
# Code kept byte-identical.
4041 class YouJizzIE(InfoExtractor):
4042 """Information extractor for youjizz.com."""
4043 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4045 def _real_extract(self, url):
4046 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not in view).
4048 self._downloader.report_error(u'invalid URL: %s' % url)
4051 video_id = mobj.group('videoid')
4053 # Get webpage content
4054 webpage = self._download_webpage(url, video_id)
4056 # Get the video title
4057 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4059 raise ExtractorError(u'ERROR: unable to extract video title')
4060 video_title = result.group('title').strip()
4062 # Get the embed page
# group(0) = full embed URL; the numeric id in it supersedes the slug id.
4063 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4065 raise ExtractorError(u'ERROR: unable to extract embed page')
4067 embed_page_url = result.group(0).strip()
4068 video_id = result.group('videoid')
# Second fetch: the embed page contains the actual media URL.
4070 webpage = self._download_webpage(embed_page_url, video_id)
# Media URL is passed to the flash player via so.addVariable("file", ...).
4073 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4075 raise ExtractorError(u'ERROR: unable to extract video url')
4076 video_url = result.group('source')
# Result dictionary (some fields elided in this sampled view); player_url is
# the embed page, used by rtmpdump-style downloads per the class contract.
4078 info = {'id': video_id,
4080 'title': video_title,
4083 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: parses the PAGE.mix JSON blob from the
# playlist page, then walks the play/next JSON API one track at a time until
# the API reports the last track.
# NOTE(review): sampled excerpt — `mix_id` is used below but its assignment
# (presumably from `data`) is not in view; loop-body lines that append each
# track dict and `break` are likewise missing. Code kept byte-identical.
4087 class EightTracksIE(InfoExtractor):
4089 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4091 def _real_extract(self, url):
4092 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not in view).
4094 raise ExtractorError(u'Invalid URL: %s' % url)
4095 playlist_id = mobj.group('id')
4097 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded on the page as a JS assignment: PAGE.mix = {...};
4099 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4101 raise ExtractorError(u'Cannot find trax information')
4102 json_like = m.group(1)
4103 data = json.loads(json_like)
# Random session id required by the play API (not security-sensitive).
4105 session = str(random.randint(0, 1000000000))
4107 track_count = data['tracks_count']
# First API hit starts playback; subsequent hits use the /next endpoint.
4108 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4109 next_url = first_url
# itertools.count(): loop until the API flags at_last_track (break not in view).
4111 for i in itertools.count():
4112 api_json = self._download_webpage(next_url, playlist_id,
4113 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4114 errnote=u'Failed to download song information')
4115 api_data = json.loads(api_json)
4116 track_data = api_data[u'set']['track']
# Per-track info dict fields (enclosing literal elided in this sampled view).
4118 'id': track_data['id'],
4119 'url': track_data['track_file_stream_url'],
4120 'title': track_data['performer'] + u' - ' + track_data['name'],
4121 'raw_title': track_data['name'],
4122 'uploader_id': data['user']['login'],
# Stop once the API says this was the final track of the mix.
4126 if api_data['set']['at_last_track']:
# Chain to the next track, keyed by the track id just fetched.
4128 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com short videos: the CDN URL and thumbnail are derived
# directly from the video id; title and uploader are scraped from the page.
# NOTE(review): sampled excerpt — the enclosing info-dict literal and return
# are partially out of view. Code kept byte-identical.
4131 class KeekIE(InfoExtractor):
4132 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4135 def _real_extract(self, url):
4136 m = re.match(self._VALID_URL, url)
4137 video_id = m.group('videoID')
# CDN URLs are fully determined by the video id — no page parsing needed.
4138 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4139 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4140 webpage = self._download_webpage(url, video_id)
# Title from the Open Graph meta tag; unescape HTML entities.
4141 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4142 title = unescapeHTML(m.group('title'))
# Uploader name sits in an <h2> inside the user-name-and-bio div.
4143 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4144 uploader = clean_html(m.group('uploader'))
# Tail of the result dict (opening lines elided in this sampled view).
4150 'thumbnail': thumbnail,
4151 'uploader': uploader
# Extractor for ted.com: handles both single talks and playlists. A verbose
# regex distinguishes the two URL shapes; playlists fan out to per-talk
# extraction via _talk_info.
# NOTE(review): sampled excerpt — several lines (regex alternation pieces,
# `info=[]`, returns, parts of the final info dict) are not in view.
# Code kept byte-identical.
4155 class TEDIE(InfoExtractor):
# Verbose-mode regex (note suitable() below passes re.VERBOSE explicitly).
4156 _VALID_URL=r'''http://www.ted.com/
4158 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4160 ((?P<type_talk>talks)) # We have a simple talk
4162 /(?P<name>\w+) # Here goes the name and then ".html"
# Overrides the base suitable(): the pattern is VERBOSE, so the default
# re.match without flags would not work here.
4166 def suitable(cls, url):
4167 """Receives a URL and returns True if suitable for this IE."""
4168 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4170 def _real_extract(self, url):
4171 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Single talk: return a one-element list per the _real_extract contract.
4172 if m.group('type_talk'):
4173 return [self._talk_info(url)]
# Otherwise it's a playlist (else branch line not in view).
4175 playlist_id=m.group('playlist_id')
4176 name=m.group('name')
4177 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4178 return self._playlist_videos_info(url,name,playlist_id)
4180 def _talk_video_link(self,mediaSlug):
4181 '''Returns the video link for that mediaSlug'''
4182 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4184 def _playlist_videos_info(self,url,name,playlist_id=0):
4185 '''Returns the videos of the playlist'''
# Verbose regex over <li id="talk_..."> entries (opening line not in view).
4187 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4188 ([.\s]*?)data-playlist_item_id="(\d+)"
4189 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4191 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4192 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
# Pair up the id matches with the name matches positionally via zip.
4193 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4194 m_names=re.finditer(video_name_RE,webpage)
4196 for m_video, m_name in zip(m_videos,m_names):
4197 video_id=m_video.group('video_id')
4198 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
# Delegate each talk to the single-talk extractor; collect into info.
4199 info.append(self._talk_info(talk_url,video_id))
4202 def _talk_info(self, url, video_id=0):
4203 """Return the video for the talk in the url"""
4204 m=re.match(self._VALID_URL, url,re.VERBOSE)
4205 videoName=m.group('name')
4206 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4207 # If the url includes the language we get the title translated
4208 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4209 title=re.search(title_RE, webpage).group('title')
# id and mediaSlug come from the embedded talkDetails JS object.
4210 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4211 "id":(?P<videoID>[\d]+).*?
4212 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4213 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4214 thumb_match=re.search(thumb_RE,webpage)
4215 info_match=re.search(info_RE,webpage,re.VERBOSE)
4216 video_id=info_match.group('videoID')
4217 mediaSlug=info_match.group('mediaSlug')
# Direct download URL built from the media slug.
4218 video_url=self._talk_video_link(mediaSlug)
# Tail of the result dict (opening lines elided in this sampled view).
4224 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: derives the video id from the URL path, then
# queries the site's XML metadata endpoint and reads url/title/format/
# description/thumbnail elements from it.
# NOTE(review): sampled excerpt — some guard/else/return lines (e.g. the
# numeric check that falls back to the parent path element, default values for
# description/thumbnail, the opening of the info dict) are not in view.
# Code kept byte-identical.
4228 class MySpassIE(InfoExtractor):
4229 _VALID_URL = r'http://www.myspass.de/.*'
4231 def _real_extract(self, url):
4232 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4234 # video id is the last path element of the URL
4235 # usually there is a trailing slash, so also try the second but last
4236 url_path = compat_urllib_parse_urlparse(url).path
4237 url_parent_path, video_id = os.path.split(url_path)
# Fallback: with a trailing slash the last element is empty, so take the
# parent path's last element instead (the guarding condition is not in view).
4239 _, video_id = os.path.split(url_parent_path)
4242 metadata_url = META_DATA_URL_TEMPLATE % video_id
4243 metadata_text = self._download_webpage(metadata_url, video_id)
# Parse the XML; encode to bytes first since fromstring is fed a str here.
4244 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4246 # extract values from metadata
4247 url_flv_el = metadata.find('url_flv')
4248 if url_flv_el is None:
4249 self._downloader.report_error(u'unable to extract download url')
4251 video_url = url_flv_el.text
# File extension taken from the URL path, minus the leading dot.
4252 extension = os.path.splitext(video_url)[1][1:]
4253 title_el = metadata.find('title')
4254 if title_el is None:
4255 self._downloader.report_error(u'unable to extract title')
4257 title = title_el.text
4258 format_id_el = metadata.find('format_id')
4259 if format_id_el is None:
# Reached when format_id is present (the fallback branch is not in view).
4262 format = format_id_el.text
# description and thumbnail are optional XML elements.
4263 description_el = metadata.find('description')
4264 if description_el is not None:
4265 description = description_el.text
4268 imagePreview_el = metadata.find('imagePreview')
4269 if imagePreview_el is not None:
4270 thumbnail = imagePreview_el.text
# Tail of the result dict (opening lines elided in this sampled view).
4279 'thumbnail': thumbnail,
4280 'description': description
# Extractor for spiegel.de videos: scrapes the title from the article page and
# reads the stream filename/duration from a per-video XML manifest, picking the
# last <type> entry in the manifest.
# NOTE(review): sampled excerpt — the `if m is None:` guard before the raise
# and the opening of the final info dict are not in view. Code kept
# byte-identical.
4284 class SpiegelIE(InfoExtractor):
4285 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4287 def _real_extract(self, url):
4288 m = re.match(self._VALID_URL, url)
4289 video_id = m.group('videoID')
4291 webpage = self._download_webpage(url, video_id)
4292 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4294 raise ExtractorError(u'Cannot find title')
4295 video_title = unescapeHTML(m.group(1))
# Per-video XML manifest listing available stream variants.
4297 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4298 xml_code = self._download_webpage(xml_url, video_id,
4299 note=u'Downloading XML', errnote=u'Failed to download XML')
4301 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: take the last child element of the manifest root — presumably
# the highest-quality variant; verify against the actual XML schema.
4302 last_type = idoc[-1]
4303 filename = last_type.findall('./filename')[0].text
4304 duration = float(last_type.findall('./duration')[0].text)
4306 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension = everything after the last dot of the manifest filename.
4307 video_ext = filename.rpartition('.')[2]
# Tail of the result dict (opening lines elided in this sampled view).
4312 'title': video_title,
4313 'duration': duration,
# Extractor for liveleak.com: scrapes the JS player's `file:` URL plus
# OpenGraph title/description and a best-effort uploader name.
# NOTE(review): sampled excerpt — `if ... is None:` guards and default
# assignments (desc/uploader fallbacks) are not in view; also mixes the
# deprecated self._downloader.trouble(...) with report_error(...), an
# inconsistency worth fixing when the full file is available.
# Code kept byte-identical.
4317 class LiveLeakIE(InfoExtractor):
4319 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4320 IE_NAME = u'liveleak'
4322 def _real_extract(self, url):
4323 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not in view).
4325 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4328 video_id = mobj.group('video_id')
4330 webpage = self._download_webpage(url, video_id)
# Media URL comes from the JS player config: file: "...",
4332 m = re.search(r'file: "(.*?)",', webpage)
4334 self._downloader.report_error(u'unable to find video url')
4336 video_url = m.group(1)
# Title via OpenGraph; strip the site-name prefix LiveLeak adds.
4338 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4340 self._downloader.trouble(u'Cannot find video title')
4341 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
# Description is optional (fallback assignment not in view).
4343 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4345 desc = unescapeHTML(m.group('desc'))
# Best-effort uploader scrape from the "By: ..." byline.
4349 m = re.search(r'By:.*?(\w+)</a>', webpage)
4351 uploader = clean_html(m.group(1))
# Tail of the result dict (opening lines elided in this sampled view).
4360 'description': desc,
4361 'uploader': uploader
# Extractor for ARD Mediathek / daserste.de: finds the video id (preferring an
# explicit documentId query parameter), collects all mediaCollection stream
# registrations from the page, and picks the best-quality default-type stream,
# distinguishing RTMP streams from plain HTTP mp4 downloads.
# NOTE(review): sampled excerpt — several `if`/`else` lines are not in view
# (e.g. the condition that should wrap the fsk assert/report pair at 4386-4387;
# as shown, `assert` directly followed by report_error reads like a missing
# `if not streams:` style guard). Code kept byte-identical.
4366 class ARDIE(InfoExtractor):
4367 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4368 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
# Each addMediaStream(...) JS call registers one stream variant:
# media_type, quality, rtmp_url, video_url.
4369 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4371 def _real_extract(self, url):
4372 # determine video id from url
4373 m = re.match(self._VALID_URL, url)
# Prefer the numeric documentId query parameter when present; otherwise
# fall back to the last path segment (branch lines not in view).
4375 numid = re.search(r'documentId=([0-9]+)', url)
4377 video_id = numid.group(1)
4379 video_id = m.group('video_id')
4381 # determine title and media streams from webpage
4382 html = self._download_webpage(url, video_id)
4383 title = re.search(self._TITLE, html).group('title')
4384 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# FSK = German age rating; such videos are only served after 20:00.
4386 assert '"fsk"' in html
4387 self._downloader.report_error(u'this video is only available after 8:00 pm')
4390 # choose default media type and highest quality for now
4391 stream = max([s for s in streams if int(s["media_type"]) == 0],
4392 key=lambda s: int(s["quality"]))
4394 # there's two possibilities: RTMP stream or HTTP download
4395 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4396 if stream['rtmp_url']:
4397 self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
# RTMP play paths for these streams always start with "mp4:".
4398 assert stream['video_url'].startswith('mp4:')
4399 info["url"] = stream["rtmp_url"]
4400 info["play_path"] = stream['video_url']
# HTTP branch (else line not in view): direct .mp4 download URL.
4402 assert stream["video_url"].endswith('.mp4')
4403 info["url"] = stream["video_url"]
4407 def gen_extractors():
4408 """ Return a list of an instance of every supported extractor.
4409 The order does matter; the first extractor matched is the one handling the URL.
4412 YoutubePlaylistIE(),
4437 StanfordOpenClassroomIE(),
4447 WorldStarHipHopIE(),