2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this chunk is garbled — stray line-number prefixes remain, indentation
# is stripped, and interior source lines are elided (note the gaps in the embedded
# numbering). Code below is kept byte-identical; only comments/docstrings are added.
23 class InfoExtractor(object):
24     """Information Extractor class.
26     Information extractors are the classes that, given a URL, extract
27     information about the video (or videos) the URL refers to. This
28     information includes the real video URL, the video title, author and
29     others. The information is stored in a dictionary which is then
30     passed to the FileDownloader. The FileDownloader processes this
31     information possibly downloading the video to the file system, among
32     other possible outcomes.
34     The dictionaries must include the following fields:
38     title: Video title, unescaped.
39     ext: Video filename extension.
41     The following fields are optional:
43     format: The video format, defaults to ext (used for --get-format)
44     thumbnail: Full URL to a video thumbnail image.
45     description: One-line video description.
46     uploader: Full name of the video uploader.
47     upload_date: Video upload date (YYYYMMDD).
48     uploader_id: Nickname or id of the video uploader.
49     location: Physical location of the video.
50     player_url: SWF Player URL (used for rtmpdump).
51     subtitles: The subtitle file contents.
52     urlhandle: [internal] The urlHandle to be used to download the file,
53     like returned by urllib.request.urlopen
55     The fields should all be Unicode strings.
57     Subclasses of this one should re-define the _real_initialize() and
58     _real_extract() methods and define a _VALID_URL regexp.
59     Probably, they should also be added to the list of extractors.
61     _real_extract() must return a *list* of information dictionaries as
64     Finally, the _WORKING attribute should be set to False for broken IEs
65     in order to warn the users and skip the tests.
# Constructor: stores the (optional) FileDownloader via set_downloader().
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
# Classmethod (decorator elided from this chunk): URL dispatch test against _VALID_URL.
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
# Elided method header above this docstring — presumably the _WORKING accessor.
84 """Getter method for _WORKING."""
# Elided method header — public initialize() entry point delegating to the subclass hook.
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
# Template-method hooks: subclasses override these two.
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# Elided property/method header — derives IE_NAME by stripping the trailing "IE"
# from the class name (e.g. "YoutubeIE" -> "Youtube").
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
# Default progress note (the guarding "if note is None:" line is elided here).
117 note = u'Downloading video webpage'
# note=False suppresses the progress message entirely.
118 if note is not False:
119 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
# try: header elided — network errors below are wrapped into ExtractorError.
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
# Re-raise with the original traceback attached for debugging.
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Prefer the charset declared in the Content-Type header; the fallback branch
# (when the regex does not match) is elided from this chunk.
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
# Debug aid: dump the fetched page (base64) when --dump-intermediate-pages is set.
137 if self._downloader.params.get('dump_intermediate_pages', False):
# get_full_url() exists only on Request objects; plain URL strings hit the
# AttributeError branch (its body is elided here).
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self._downloader.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
# 'replace' keeps decoding from raising on bytes invalid in the detected charset.
145 return webpage_bytes.decode(encoding, 'replace')
# NOTE(review): interior source lines are elided throughout this class (see the gaps
# in the embedded numbering): the _VALID_URL assignment header, several try: lines,
# else: branches and dict bodies are missing. Code is kept byte-identical; only
# comments/docstrings are added or corrected.
148 class YoutubeIE(InfoExtractor):
149 """Information extractor for youtube.com."""
# Fragments of the verbose _VALID_URL regex (the "_VALID_URL = r'''..." header line
# is elided). Group 1 covers everything before the ID; group 2 is the video ID —
# see _extract_id() below, which reads mobj.group(2).
153 (?:https?://)? # http(s):// (optional)
154 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
155 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
156 (?:.*?\#/)? # handle anchor (#/) redirect urls
157 (?: # the various things that can precede the ID:
158 (?:(?:v|embed|e)/) # v/ or embed/ or e/
159 |(?: # or the v= param in all its forms
160 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
161 (?:\?|\#!?) # the params delimiter ? or # or #!
162 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
165 )? # optional -> youtube.com/xxxx is OK
166 )? # all until now is optional -> you can pass the naked ID
167 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
168 (?(1).+)? # if we found the ID, everything can follow
170 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
171 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
172 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
173 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
174 _NETRC_MACHINE = 'youtube'
175 # Listed in order of quality
176 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
177 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file extension map (most entries elided from this chunk).
178 _video_extensions = {
184 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display string map (entries elided from this chunk).
190 _video_dimensions = {
# Classmethod (decorator elided): defer playlist URLs to YoutubePlaylistIE.
209 def suitable(cls, url):
210 """Receives a URL and returns True if suitable for this IE."""
211 if YoutubePlaylistIE.suitable(url): return False
212 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
214 def report_lang(self):
215 """Report attempt to set language."""
216 self._downloader.to_screen(u'[youtube] Setting language')
218 def report_login(self):
219 """Report attempt to log in."""
220 self._downloader.to_screen(u'[youtube] Logging in')
222 def report_age_confirmation(self):
223 """Report attempt to confirm age."""
224 self._downloader.to_screen(u'[youtube] Confirming age')
226 def report_video_webpage_download(self, video_id):
227 """Report attempt to download video webpage."""
228 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
230 def report_video_info_webpage_download(self, video_id):
231 """Report attempt to download video info webpage."""
232 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
234 def report_video_subtitles_download(self, video_id):
235 """Report that available subtitles are being checked."""
236 self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
238 def report_video_subtitles_request(self, video_id, sub_lang, format):
239 """Report the download attempt for one subtitle language/format."""
240 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
242 def report_video_subtitles_available(self, video_id, sub_lang_list):
243 """Report available subtitles."""
244 sub_lang = ",".join(list(sub_lang_list.keys()))
245 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
247 def report_information_extraction(self, video_id):
248 """Report attempt to extract video information."""
249 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
251 def report_unavailable_format(self, video_id, format):
252 """Report that the requested format is not available."""
253 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
255 def report_rtmp_download(self):
256 """Indicate the download will use the RTMP protocol."""
257 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Returns a {lang_code: lang_name} dict on success, or an (error_message, None)
# tuple on failure — callers distinguish the two with isinstance(..., tuple).
259 def _get_available_subtitles(self, video_id):
260 self.report_video_subtitles_download(video_id)
261 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# try: header elided above this line.
263 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
264 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
265 return (u'unable to download video subtitles: %s' % compat_str(err), None)
266 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
267 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
268 if not sub_lang_list:
269 return (u'video doesn\'t have subtitles', None)
# NOTE(review): the success "return sub_lang_list" line is elided from this chunk.
272 def _list_available_subtitles(self, video_id):
273 sub_lang_list = self._get_available_subtitles(video_id)
274 self.report_video_subtitles_available(video_id, sub_lang_list)
# Fetches one subtitle track; returns (error_message, sub_lang, sub) with
# error_message None on success (docstring header lines elided here).
276 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
279 (error_message, sub_lang, sub)
281 self.report_video_subtitles_request(video_id, sub_lang, format)
# urlencode params dict (its entries — lang/name/v/fmt presumably — are elided).
282 params = compat_urllib_parse.urlencode({
288 url = 'http://www.youtube.com/api/timedtext?' + params
# try: header elided above this line.
290 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
291 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
292 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# Empty-body response (the "if not sub:" guard line is elided).
294 return (u'Did not fetch video subtitles', None, None)
295 return (None, sub_lang, sub)
# Picks one language (user-requested, else 'en', else first available) and
# downloads it; always returns a single-element list.
297 def _extract_subtitle(self, video_id):
299 Return a list with a tuple:
300 [(error_message, sub_lang, sub)]
302 sub_lang_list = self._get_available_subtitles(video_id)
303 sub_format = self._downloader.params.get('subtitlesformat')
304 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
305 return [(sub_lang_list[0], None, None)]
306 if self._downloader.params.get('subtitleslang', False):
307 sub_lang = self._downloader.params.get('subtitleslang')
308 elif 'en' in sub_lang_list:
# Fallback branch ("sub_lang = 'en'" and the else: header are elided here).
311 sub_lang = list(sub_lang_list.keys())[0]
312 if not sub_lang in sub_lang_list:
313 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
315 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# NOTE(review): the "return [subtitle]" line is elided from this chunk.
# Downloads every available subtitle track; returns a list of result tuples.
318 def _extract_all_subtitles(self, video_id):
319 sub_lang_list = self._get_available_subtitles(video_id)
320 sub_format = self._downloader.params.get('subtitlesformat')
321 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
322 return [(sub_lang_list[0], None, None)]
# "subtitles = []" initialization is elided above this loop.
324 for sub_lang in sub_lang_list:
325 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
326 subtitles.append(subtitle)
# Prints one line per itag with extension and dimensions (the for-loop header
# binding x is elided above the print).
329 def _print_formats(self, formats):
330 print('Available formats:')
332 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Sets interface language, then logs in (username/password or .netrc) and
# confirms age. Large parts of the credential/login flow are elided here.
334 def _real_initialize(self):
335 if self._downloader is None:
340 downloader_params = self._downloader.params
342 # Attempt to use provided username and password or .netrc data
343 if downloader_params.get('username', None) is not None:
344 username = downloader_params['username']
345 password = downloader_params['password']
346 elif downloader_params.get('usenetrc', False):
# try: header elided; netrc failures only warn, they never abort.
348 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
353 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
354 except (IOError, netrc.NetrcParseError) as err:
355 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Force-set interface language so page scraping sees English markup.
359 request = compat_urllib_request.Request(self._LANG_URL)
362 compat_urllib_request.urlopen(request).read()
363 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
364 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
367 # No authentication to be performed
# Fetch the login page to scrape the GALX / dsh hidden form tokens.
371 request = compat_urllib_request.Request(self._LOGIN_URL)
373 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
374 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
375 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
380 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
382 galx = match.group(1)
384 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Google ServiceLogin form fields (the dict header and several entries —
# Email/Passwd/GALX among them, presumably — are elided from this chunk).
390 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
394 u'PersistentCookie': u'yes',
396 u'bgresponse': u'js_disabled',
397 u'checkConnection': u'',
398 u'checkedDomains': u'youtube',
404 u'signIn': u'Sign in',
406 u'service': u'youtube',
410 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
412 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
413 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
414 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
417 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, authentication failed.
418 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
419 self._downloader.report_warning(u'unable to log in: bad username or password')
421 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
422 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Age-gate confirmation POST (the age_form dict header is elided above).
428 'action_confirm': 'Confirm',
430 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
432 self.report_age_confirmation()
433 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
434 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
435 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
# Extracts the 11-char video ID (regex group 2) from any supported URL form.
438 def _extract_id(self, url):
439 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
441 self._downloader.report_error(u'invalid URL: %s' % url)
443 video_id = mobj.group(2)
# NOTE(review): the "return video_id" line is elided from this chunk.
446 def _real_extract(self, url):
447 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
448 mobj = re.search(self._NEXT_URL_RE, url)
450 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
451 video_id = self._extract_id(url)
# Fetch the watch page (gl/hl pinned to US-English, age-gate pre-verified).
454 self.report_video_webpage_download(video_id)
455 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
456 request = compat_urllib_request.Request(url)
458 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
459 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
460 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
463 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
465 # Attempt to extract SWF player URL
466 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON-style backslash escaping in the matched URL.
468 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several get_video_info 'el' variants until one yields a token.
473 self.report_video_info_webpage_download(video_id)
474 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
475 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
476 % (video_id, el_type))
477 video_info_webpage = self._download_webpage(video_info_url, video_id,
479 errnote='unable to download video info webpage')
480 video_info = compat_parse_qs(video_info_webpage)
481 if 'token' in video_info:
# (loop break elided) — no variant produced a token: report why.
483 if 'token' not in video_info:
484 if 'reason' in video_info:
485 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
487 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
490 # Check for "rental" videos
491 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
492 self._downloader.report_error(u'"rental" videos not supported')
495 # Start extracting information
496 self.report_information_extraction(video_id)
# uploader (the "# uploader" section-header comments are elided throughout below)
499 if 'author' not in video_info:
500 self._downloader.report_error(u'unable to extract uploader name')
502 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id: optional, scraped from the watch-page markup.
505 video_uploader_id = None
506 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
508 video_uploader_id = mobj.group(1)
510 self._downloader.report_warning(u'unable to extract uploader nickname')
# title: required.
513 if 'title' not in video_info:
514 self._downloader.report_error(u'unable to extract video title')
516 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail: optional.
519 if 'thumbnail_url' not in video_info:
520 self._downloader.report_warning(u'unable to extract video thumbnail')
522 else: # don't panic if we can't find it
523 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the page and normalized to YYYYMMDD.
527 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
529 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
530 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
531 for expression in format_expressions:
# try/except around strptime is elided; first matching format wins.
533 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: best-effort HTML scrape.
538 video_description = get_element_by_id("eow-description", video_webpage)
539 if video_description:
540 video_description = clean_html(video_description)
542 video_description = ''
# subtitles: three mutually-related options (--write-sub, --all-subs, --list-subs).
545 video_subtitles = None
547 if self._downloader.params.get('writesubtitles', False):
548 video_subtitles = self._extract_subtitle(video_id)
550 (sub_error, sub_lang, sub) = video_subtitles[0]
552 self._downloader.report_error(sub_error)
554 if self._downloader.params.get('allsubtitles', False):
555 video_subtitles = self._extract_all_subtitles(video_id)
556 for video_subtitle in video_subtitles:
557 (sub_error, sub_lang, sub) = video_subtitle
559 self._downloader.report_error(sub_error)
561 if self._downloader.params.get('listsubtitles', False):
562 sub_lang_list = self._list_available_subtitles(video_id)
# duration: optional (seconds, as a string from the query-string parse).
565 if 'length_seconds' not in video_info:
566 self._downloader.report_warning(u'unable to extract video duration')
569 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
572 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
574 # Decide which formats to download
575 req_format = self._downloader.params.get('format', None)
576
577 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
578 self.report_rtmp_download()
579 video_url_list = [(None, video_info['conn'][0])]
580 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Parse the comma-separated stream map into {itag: signed_url}.
581 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
582 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
583 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
584 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
# Apply --max-quality / --prefer-free-formats to the ranked itag list.
586 format_limit = self._downloader.params.get('format_limit', None)
587 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
588 if format_limit is not None and format_limit in available_formats:
589 format_list = available_formats[available_formats.index(format_limit):]
591 format_list = available_formats
592 existing_formats = [x for x in format_list if x in url_map]
593 if len(existing_formats) == 0:
594 self._downloader.report_error(u'no known formats available for video')
596 if self._downloader.params.get('listformats', None):
597 self._print_formats(existing_formats)
599 if req_format is None or req_format == 'best':
600 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
601 elif req_format == 'worst':
602 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
603 elif req_format in ('-1', 'all'):
604 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
606 # Specific formats. We pick the first in a slash-delimeted sequence.
607 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
608 req_formats = req_format.split('/')
609 video_url_list = None
610 for rf in req_formats:
# (the "if rf in url_map:" guard and break are elided around this line)
612 video_url_list = [(rf, url_map[rf])]
614 if video_url_list is None:
615 self._downloader.report_error(u'requested format not available')
# else: branch of the conn / stream_map dispatch (its header is elided).
618 self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected format (the results-list setup and the
# dict-literal opening lines are elided around this loop).
622 for format_param, video_real_url in video_url_list:
624 video_extension = self._video_extensions.get(format_param, 'flv')
626 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
627 self._video_dimensions.get(format_param, '???'))
631 'url': video_real_url,
632 'uploader': video_uploader,
633 'uploader_id': video_uploader_id,
634 'upload_date': upload_date,
635 'title': video_title,
636 'ext': video_extension,
637 'format': video_format,
638 'thumbnail': video_thumbnail,
639 'description': video_description,
640 'player_url': player_url,
641 'subtitles': video_subtitles,
642 'duration': video_duration
# NOTE(review): interior source lines are elided in this class (try: headers,
# if-guards, dict headers). Code is byte-identical; only comments are added.
647 class MetacafeIE(InfoExtractor):
648 """Information Extractor for metacafe.com."""
# Group 1 of _VALID_URL is the video id, group 2 the simplified title.
650 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
651 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
652 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
653 IE_NAME = u'metacafe'
655 def __init__(self, downloader=None):
656 InfoExtractor.__init__(self, downloader)
658 def report_disclaimer(self):
659 """Report disclaimer retrieval."""
660 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
662 def report_age_confirmation(self):
663 """Report attempt to confirm age."""
664 self._downloader.to_screen(u'[metacafe] Confirming age')
666 def report_download_webpage(self, video_id):
667 """Report webpage download."""
668 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
670 def report_extraction(self, video_id):
671 """Report information extraction."""
672 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the disclaimer page, then POSTs the family-filter form to disable it.
674 def _real_initialize(self):
675 # Retrieve disclaimer
676 request = compat_urllib_request.Request(self._DISCLAIMER)
678 self.report_disclaimer()
679 disclaimer = compat_urllib_request.urlopen(request).read()
680 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
681 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
# disclaimer_form dict header and its other entries are elided above this line.
687 'submit': "Continue - I'm over 18",
689 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
691 self.report_age_confirmation()
692 disclaimer = compat_urllib_request.urlopen(request).read()
693 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
694 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
697 def _real_extract(self, url):
698 # Extract id and simplified title from URL
699 mobj = re.match(self._VALID_URL, url)
701 self._downloader.report_error(u'invalid URL: %s' % url)
704 video_id = mobj.group(1)
706 # Check if video comes from YouTube
# "yt-XXXX" ids are embedded YouTube videos: hand them to the downloader's
# normal dispatch (which will route them to YoutubeIE) and stop here.
707 mobj2 = re.match(r'^yt-(.*)$', video_id)
708 if mobj2 is not None:
709 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
712 # Retrieve video webpage to extract further information
713 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
715 self.report_download_webpage(video_id)
716 webpage = compat_urllib_request.urlopen(request).read()
717 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): message has a typo — "unable retrieve" should read
# "unable to retrieve" (runtime string left untouched in this doc pass).
718 self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
721 # Extract URL, uploader and title from webpage
722 self.report_extraction(video_id)
# Primary strategy: direct mediaURL (+ optional gdaKey token) in the page.
723 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
725 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
726 video_extension = mediaURL[-3:]
728 # Extract gdaKey if available
729 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
733 gdaKey = mobj.group(1)
734 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback strategy: parse the flashvars blob for mediaData JSON.
736 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
738 self._downloader.report_error(u'unable to extract media URL')
740 vardict = compat_parse_qs(mobj.group(1))
741 if 'mediaData' not in vardict:
742 self._downloader.report_error(u'unable to extract media URL')
744 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
746 self._downloader.report_error(u'unable to extract media URL')
# Un-escape JSON-style "\/" in the URL; group 2 is the gda key.
748 mediaURL = mobj.group(1).replace('\\/', '/')
749 video_extension = mediaURL[-3:]
750 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
752 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
754 self._downloader.report_error(u'unable to extract title')
756 video_title = mobj.group(1).decode('utf-8')
758 mobj = re.search(r'submitter=(.*?);', webpage)
760 self._downloader.report_error(u'unable to extract uploader nickname')
762 video_uploader = mobj.group(1)
# Result info dict (its opening literal/return lines are elided above).
765 'id': video_id.decode('utf-8'),
766 'url': video_url.decode('utf-8'),
767 'uploader': video_uploader.decode('utf-8'),
769 'title': video_title,
770 'ext': video_extension.decode('utf-8'),
# NOTE(review): interior source lines are elided in this class. Code is kept
# byte-identical; only comments are added.
774 class DailymotionIE(InfoExtractor):
775 """Information Extractor for Dailymotion"""
# Group 1 captures the path segment after /video/ (id + slug + query).
777 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
778 IE_NAME = u'dailymotion'
781 def __init__(self, downloader=None):
782 InfoExtractor.__init__(self, downloader)
784 def report_extraction(self, video_id):
785 """Report information extraction."""
786 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
788 def _real_extract(self, url):
789 # Extract id and simplified title from URL
790 mobj = re.match(self._VALID_URL, url)
792 self._downloader.report_error(u'invalid URL: %s' % url)
# Strip the "_title-slug" suffix and any query string to get the bare id.
795 video_id = mobj.group(1).split('_')[0].split('?')[0]
797 video_extension = 'mp4'
799 # Retrieve video webpage to extract further information
800 request = compat_urllib_request.Request(url)
# Cookie disables Dailymotion's family filter so restricted pages load.
801 request.add_header('Cookie', 'family_filter=off')
802 webpage = self._download_webpage(request, video_id)
804 # Extract URL, uploader and title from webpage
805 self.report_extraction(video_id)
806 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
808 self._downloader.report_error(u'unable to extract media URL')
810 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Scan qualities from best to worst; first key found in flashvars wins
# (the max_quality assignment and break inside the loop are elided).
812 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
815 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
818 self._downloader.report_error(u'unable to extract video URL')
821 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
823 self._downloader.report_error(u'unable to extract video URL')
# Un-escape JSON-style "\/" in the extracted URL.
826 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
828 # TODO: support choosing qualities
830 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
832 self._downloader.report_error(u'unable to extract title')
834 video_title = unescapeHTML(mobj.group('title'))
836 video_uploader = None
837 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
839 # lookin for official user
840 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
841 if mobj_official is None:
842 self._downloader.report_warning(u'unable to extract uploader nickname')
844 video_uploader = mobj_official.group(1)
846 video_uploader = mobj.group(1)
# Upload date appears as DD-MM-YYYY in the page; normalized to YYYYMMDD.
848 video_upload_date = None
849 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
851 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result info dict (its opening literal/return and id/url entries are elided).
856 'uploader': video_uploader,
857 'upload_date': video_upload_date,
858 'title': video_title,
859 'ext': video_extension,
# NOTE(review): interior source lines are elided in this class. Code is kept
# byte-identical; only comments are added.
863 class PhotobucketIE(InfoExtractor):
864 """Information extractor for photobucket.com."""
# Group 1 of _VALID_URL is the .flv filename from the ?current= parameter.
866 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
867 IE_NAME = u'photobucket'
869 def __init__(self, downloader=None):
870 InfoExtractor.__init__(self, downloader)
872 def report_download_webpage(self, video_id):
873 """Report webpage download."""
874 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
876 def report_extraction(self, video_id):
877 """Report information extraction."""
878 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
880 def _real_extract(self, url):
881 # Extract id from URL
882 mobj = re.match(self._VALID_URL, url)
884 self._downloader.report_error(u'Invalid URL: %s' % url)
887 video_id = mobj.group(1)
# Photobucket only serves flv here, so the extension is fixed.
889 video_extension = 'flv'
891 # Retrieve video webpage to extract further information
892 request = compat_urllib_request.Request(url)
894 self.report_download_webpage(video_id)
895 webpage = compat_urllib_request.urlopen(request).read()
896 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
897 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
900 # Extract URL, uploader, and title from webpage
901 self.report_extraction(video_id)
902 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
904 self._downloader.report_error(u'unable to extract media URL')
# The "video_url = mediaURL" assignment is elided below this line.
906 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# Title and uploader come from one <title> match: group 1 / group 2.
910 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
912 self._downloader.report_error(u'unable to extract title')
914 video_title = mobj.group(1).decode('utf-8')
916 video_uploader = mobj.group(2).decode('utf-8')
# Result info dict (its opening literal/return lines are elided above).
919 'id': video_id.decode('utf-8'),
920 'url': video_url.decode('utf-8'),
921 'uploader': video_uploader,
923 'title': video_title,
924 'ext': video_extension.decode('utf-8'),
928 class YahooIE(InfoExtractor):
929 """Information extractor for video.yahoo.com."""
# Two-stage extraction: URLs matching _VALID_URL but not _VPAGE_URL are first
# rewritten to a canonical English-language /watch/ URL, then re-extracted.
932 # _VALID_URL matches all Yahoo! Video URLs
933 # _VPAGE_URL matches only the extractable '/watch/' URLs
934 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
935 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
936 IE_NAME = u'video.yahoo'
938 def __init__(self, downloader=None):
939 InfoExtractor.__init__(self, downloader)
941 def report_download_webpage(self, video_id):
942 """Report webpage download."""
943 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
945 def report_extraction(self, video_id):
946 """Report information extraction."""
947 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
949 def _real_extract(self, url, new_video=True):
# new_video=False marks the recursive call made after the URL rewrite below,
# so the rewritten /watch/ URL is not rewritten again.
950 # Extract ID from URL
951 mobj = re.match(self._VALID_URL, url)
953 self._downloader.report_error(u'Invalid URL: %s' % url)
# The second numeric URL component (group 2) is the video id.
956 video_id = mobj.group(2)
957 video_extension = 'flv'
959 # Rewrite valid but non-extractable URLs as
960 # extractable English language /watch/ URLs
961 if re.match(self._VPAGE_URL, url) is None:
962 request = compat_urllib_request.Request(url)
964 webpage = compat_urllib_request.urlopen(request).read()
965 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
966 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
# The page embeds the canonical ids as ("id", "...") and ("vid", "...").
969 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
971 self._downloader.report_error(u'Unable to extract id field')
973 yahoo_id = mobj.group(1)
975 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
977 self._downloader.report_error(u'Unable to extract vid field')
979 yahoo_vid = mobj.group(1)
981 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
982 return self._real_extract(url, new_video=False)
984 # Retrieve video webpage to extract further information
985 request = compat_urllib_request.Request(url)
987 self.report_download_webpage(video_id)
988 webpage = compat_urllib_request.urlopen(request).read()
989 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
990 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
993 # Extract uploader and title from webpage
994 self.report_extraction(video_id)
995 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
997 self._downloader.report_error(u'unable to extract video title')
# .decode('utf-8') throughout indicates the page is handled as a byte string
# (Python 2-era code path).
999 video_title = mobj.group(1).decode('utf-8')
1001 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1003 self._downloader.report_error(u'unable to extract video uploader')
# NOTE(review): group(1) of this regex is the (people|profile) alternative;
# the uploader name is group(2). This looks like a latent bug — confirm.
1005 video_uploader = mobj.group(1).decode('utf-8')
1007 # Extract video thumbnail
1008 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1010 self._downloader.report_error(u'unable to extract video thumbnail')
1012 video_thumbnail = mobj.group(1).decode('utf-8')
1014 # Extract video description
1015 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1017 self._downloader.report_error(u'unable to extract video description')
1019 video_description = mobj.group(1).decode('utf-8')
1020 if not video_description:
1021 video_description = 'No description available.'
1023 # Extract video height and width
1024 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1026 self._downloader.report_error(u'unable to extract video height')
1028 yv_video_height = mobj.group(1)
1030 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1032 self._downloader.report_error(u'unable to extract video width')
1034 yv_video_width = mobj.group(1)
1036 # Retrieve video playlist to extract media URL
1037 # I'm not completely sure what all these options are, but we
1038 # seem to need most of them, otherwise the server sends a 401.
1039 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1040 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1041 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1042 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1043 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1045 self.report_download_webpage(video_id)
1046 webpage = compat_urllib_request.urlopen(request).read()
1047 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1048 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1051 # Extract media URL from playlist XML
1052 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1054 self._downloader.report_error(u'Unable to extract media URL')
# Final media URL = APP + FULLPATH, percent-decoded then HTML-unescaped.
1056 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1057 video_url = unescapeHTML(video_url)
1060 'id': video_id.decode('utf-8'),
1062 'uploader': video_uploader,
1063 'upload_date': None,
1064 'title': video_title,
1065 'ext': video_extension.decode('utf-8'),
1066 'thumbnail': video_thumbnail.decode('utf-8'),
1067 'description': video_description,
1071 class VimeoIE(InfoExtractor):
1072 """Information extractor for vimeo.com."""
1074 # _VALID_URL matches Vimeo URLs
# Named groups: 'proto' (optional scheme), 'direct_link' (play_redirect_hls
# form) and 'id' (numeric video id).
1075 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1078 def __init__(self, downloader=None):
1079 InfoExtractor.__init__(self, downloader)
1081 def report_download_webpage(self, video_id):
1082 """Report webpage download."""
1083 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1085 def report_extraction(self, video_id):
1086 """Report information extraction."""
1087 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1089 def _real_extract(self, url, new_video=True):
1090 # Extract ID from URL
1091 mobj = re.match(self._VALID_URL, url)
1093 self._downloader.report_error(u'Invalid URL: %s' % url)
1096 video_id = mobj.group('id')
# Normalize the URL: force https:// and collapse direct-link forms to the
# canonical https://vimeo.com/<id> page.
1097 if not mobj.group('proto'):
1098 url = 'https://' + url
1099 if mobj.group('direct_link'):
1100 url = 'https://vimeo.com/' + video_id
1102 # Retrieve video webpage to extract further information
1103 request = compat_urllib_request.Request(url, None, std_headers)
1105 self.report_download_webpage(video_id)
1106 webpage_bytes = compat_urllib_request.urlopen(request).read()
1107 webpage = webpage_bytes.decode('utf-8')
1108 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1109 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1112 # Now we begin extracting as much information as we can from what we
1113 # retrieved. First we extract the information common to all extractors,
1114 # and latter we extract those that are Vimeo specific.
1115 self.report_extraction(video_id)
1117 # Extract the config JSON
# The config object is cut out of the page by plain string splitting on the
# ' = {config:' / ',assets:' markers, then parsed as JSON.
1119 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1120 config = json.loads(config)
1122 self._downloader.report_error(u'unable to extract info section')
1126 video_title = config["video"]["title"]
1128 # Extract uploader and uploader_id
1129 video_uploader = config["video"]["owner"]["name"]
1130 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1132 # Extract video thumbnail
1133 video_thumbnail = config["video"]["thumbnail"]
1135 # Extract video description
1136 video_description = get_element_by_attribute("itemprop", "description", webpage)
1137 if video_description: video_description = clean_html(video_description)
1138 else: video_description = u''
1140 # Extract upload date
# Converted from ISO 'YYYY-MM-DDT...' to the YYYYMMDD convention used by
# the 'upload_date' field.
1141 video_upload_date = None
1142 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1143 if mobj is not None:
1144 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1146 # Vimeo specific: extract request signature and timestamp
1147 sig = config['request']['signature']
1148 timestamp = config['request']['timestamp']
1150 # Vimeo specific: extract video codec and quality information
1151 # First consider quality, then codecs, then take everything
1152 # TODO bind to format param
# Codec preference order: h264 (mp4) > vp8 (flv) > vp6 (flv); within that,
# quality preference hd > sd > other.
1153 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1154 files = { 'hd': [], 'sd': [], 'other': []}
1155 for codec_name, codec_extension in codecs:
1156 if codec_name in config["video"]["files"]:
1157 if 'hd' in config["video"]["files"][codec_name]:
1158 files['hd'].append((codec_name, codec_extension, 'hd'))
1159 elif 'sd' in config["video"]["files"][codec_name]:
1160 files['sd'].append((codec_name, codec_extension, 'sd'))
1162 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first (best) entry of the best non-empty quality bucket.
1164 for quality in ('hd', 'sd', 'other'):
1165 if len(files[quality]) > 0:
1166 video_quality = files[quality][0][2]
1167 video_codec = files[quality][0][0]
1168 video_extension = files[quality][0][1]
1169 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1172 self._downloader.report_error(u'no known codec found')
# Final media URL built from the request signature/timestamp and the chosen
# quality/codec.
1175 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1176 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1181 'uploader': video_uploader,
1182 'uploader_id': video_uploader_id,
1183 'upload_date': video_upload_date,
1184 'title': video_title,
1185 'ext': video_extension,
1186 'thumbnail': video_thumbnail,
1187 'description': video_description,
1191 class ArteTvIE(InfoExtractor):
1192 """arte.tv information extractor."""
1194 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# _LIVE_URL distinguishes live-stream index pages from regular +7 videos.
1195 _LIVE_URL = r'index-[0-9]+\.html$'
1197 IE_NAME = u'arte.tv'
1199 def __init__(self, downloader=None):
1200 InfoExtractor.__init__(self, downloader)
1202 def report_download_webpage(self, video_id):
1203 """Report webpage download."""
1204 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1206 def report_extraction(self, video_id):
1207 """Report information extraction."""
1208 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Download a page, reporting progress; errors are routed through the
# downloader rather than raised.
1210 def fetch_webpage(self, url):
1211 request = compat_urllib_request.Request(url)
1213 self.report_download_webpage(url)
1214 webpage = compat_urllib_request.urlopen(request).read()
1215 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1216 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1218 except ValueError as err:
1219 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch `url`, apply `regex` with `regexFlags`, and build a dict mapping each
# key in matchTuples to the corresponding match group. Each tuple is
# (group_index, key, error_message); a missing group triggers trouble(err).
1223 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1224 page = self.fetch_webpage(url)
1225 mobj = re.search(regex, page, regexFlags)
1229 self._downloader.report_error(u'Invalid URL: %s' % url)
1232 for (i, key, err) in matchTuples:
1233 if mobj.group(i) is None:
1234 self._downloader.trouble(err)
1237 info[key] = mobj.group(i)
# Live streams: resolve the videothek JS config, then scrape the rtmp-style
# path and SWF player from it. Language code comes from the URL path.
1241 def extractLiveStream(self, url):
1242 video_lang = url.split('/')[-4]
1243 info = self.grep_webpage(
1245 r'src="(.*?/videothek_js.*?\.js)',
1248 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1251 http_host = url.split('/')[2]
1252 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1253 info = self.grep_webpage(
1255 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1256 '(http://.*?\.swf).*?' +
1260 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1261 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1262 (3, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): video_url is computed here but no return is visible in this
# method — confirm how the live-stream result reaches the caller.
1265 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "+7" catch-up videos: follow movie param -> <video lang=...> ref -> final
# XML with id/name/date and the hd-quality URL.
1267 def extractPlus7Stream(self, url):
1268 video_lang = url.split('/')[-3]
1269 info = self.grep_webpage(
1271 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1274 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1277 next_url = compat_urllib_parse.unquote(info.get('url'))
1278 info = self.grep_webpage(
1280 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1283 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1286 next_url = compat_urllib_parse.unquote(info.get('url'))
1288 info = self.grep_webpage(
1290 r'<video id="(.*?)".*?>.*?' +
1291 '<name>(.*?)</name>.*?' +
1292 '<dateVideo>(.*?)</dateVideo>.*?' +
1293 '<url quality="hd">(.*?)</url>',
1296 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1297 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1298 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1299 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1304 'id': info.get('id'),
1305 'url': compat_urllib_parse.unquote(info.get('url')),
1306 'uploader': u'arte.tv',
1307 'upload_date': info.get('date'),
1308 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live index pages vs regular +7 video pages.
1314 def _real_extract(self, url):
1315 video_id = url.split('/')[-1]
1316 self.report_extraction(video_id)
1318 if re.search(self._LIVE_URL, video_id) is not None:
1319 self.extractLiveStream(url)
1322 info = self.extractPlus7Stream(url)
1327 class GenericIE(InfoExtractor):
1328 """Generic last-resort information extractor."""
1331 IE_NAME = u'generic'
1333 def __init__(self, downloader=None):
1334 InfoExtractor.__init__(self, downloader)
1336 def report_download_webpage(self, video_id):
1337 """Report webpage download."""
# The fallback warning is suppressed in test mode.
1338 if not self._downloader.params.get('test', False):
1339 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1340 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1342 def report_extraction(self, video_id):
1343 """Report information extraction."""
1344 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1346 def report_following_redirect(self, new_url):
1347 """Report information extraction."""
1348 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1350 def _test_redirect(self, url):
1351 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Probe the URL with a HEAD request so short-link services can be resolved
# without downloading the body.
1352 class HeadRequest(compat_urllib_request.Request):
# Presumably returns "HEAD" (body not visible here) so urlopen issues a
# HEAD rather than a GET.
1353 def get_method(self):
1356 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1358 Subclass the HTTPRedirectHandler to make it use our
1359 HeadRequest also on the redirected URL
1361 def redirect_request(self, req, fp, code, msg, headers, newurl):
1362 if code in (301, 302, 303, 307):
1363 newurl = newurl.replace(' ', '%20')
# Strip body-describing headers since a HEAD request carries no body.
1364 newheaders = dict((k,v) for k,v in req.headers.items()
1365 if k.lower() not in ("content-length", "content-type"))
1366 return HeadRequest(newurl,
1368 origin_req_host=req.get_origin_req_host(),
1371 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1373 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1375 Fallback to GET if HEAD is not allowed (405 HTTP error)
1377 def http_error_405(self, req, fp, code, msg, headers):
1381 newheaders = dict((k,v) for k,v in req.headers.items()
1382 if k.lower() not in ("content-length", "content-type"))
# Re-issue as a plain (GET) Request through the same opener chain.
1383 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1385 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1389 opener = compat_urllib_request.OpenerDirector()
1390 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1391 HTTPMethodFallback, HEADRedirectHandler,
1392 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1393 opener.add_handler(handler())
1395 response = opener.open(HeadRequest(url))
1396 new_url = response.geturl()
# If the final URL differs, restart the whole extractor chain on it.
1401 self.report_following_redirect(new_url)
1402 self._downloader.download([new_url])
1405 def _real_extract(self, url):
1406 if self._test_redirect(url): return
1408 video_id = url.split('/')[-1]
1410 webpage = self._download_webpage(url, video_id)
1411 except ValueError as err:
1412 # since this is the last-resort InfoExtractor, if
1413 # this error is thrown, it'll be thrown here
1414 self._downloader.report_error(u'Invalid URL: %s' % url)
1417 self.report_extraction(video_id)
# Regex cascade, from most to least specific, looking for a direct media URL.
1418 # Start with something easy: JW Player in SWFObject
1419 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1421 # Broaden the search a little bit
1422 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1424 # Broaden the search a little bit: JWPlayer JS loader
1425 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1427 self._downloader.report_error(u'Invalid URL: %s' % url)
1430 # It's possible that one of the regexes
1431 # matched, but returned an empty group:
1432 if mobj.group(1) is None:
1433 self._downloader.report_error(u'Invalid URL: %s' % url)
1436 video_url = compat_urllib_parse.unquote(mobj.group(1))
# id and extension are derived from the media URL's basename.
1437 video_id = os.path.basename(video_url)
1439 # here's a fun little line of code for you:
1440 video_extension = os.path.splitext(video_id)[1][1:]
1441 video_id = os.path.splitext(video_id)[0]
1443 # it's tempting to parse this further, but you would
1444 # have to take into account all the variations like
1445 # Video Title - Site Name
1446 # Site Name | Video Title
1447 # Video Title - Tagline | Site Name
1448 # and so on and so forth; it's just not practical
1449 mobj = re.search(r'<title>(.*)</title>', webpage)
1451 self._downloader.report_error(u'unable to extract title')
1453 video_title = mobj.group(1)
1455 # video uploader is domain name
1456 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says 'title' but the failure is the
# uploader/domain match — looks copy-pasted; confirm before changing.
1458 self._downloader.report_error(u'unable to extract title')
1460 video_uploader = mobj.group(1)
1465 'uploader': video_uploader,
1466 'upload_date': None,
1467 'title': video_title,
1468 'ext': video_extension,
1472 class YoutubeSearchIE(InfoExtractor):
1473 """Information Extractor for YouTube search queries."""
# Handles 'ytsearch:Q' (1 result), 'ytsearchN:Q' (N results) and
# 'ytsearchall:Q' (up to _max_youtube_results).
1474 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1475 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1476 _max_youtube_results = 1000
1477 IE_NAME = u'youtube:search'
1479 def __init__(self, downloader=None):
1480 InfoExtractor.__init__(self, downloader)
1482 def report_download_page(self, query, pagenum):
1483 """Report attempt to download search page with given number."""
1484 query = query.decode(preferredencoding())
1485 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1487 def _real_extract(self, query):
1488 mobj = re.match(self._VALID_URL, query)
1490 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split off the 'ytsearch...' prefix; what remains is the actual query.
1493 prefix, query = query.split(':')
1495 query = query.encode('utf-8')
1497 self._download_n_results(query, 1)
1499 elif prefix == 'all':
1500 self._download_n_results(query, self._max_youtube_results)
1506 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1508 elif n > self._max_youtube_results:
1509 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1510 n = self._max_youtube_results
1511 self._download_n_results(query, n)
1513 except ValueError: # parsing prefix as integer fails
1514 self._download_n_results(query, 1)
1517 def _download_n_results(self, query, n):
1518 """Downloads a specified number of results for a query"""
# Pages through the GData API 50 results at a time until `limit` (the lesser
# of n and the API's totalItems) is reached.
1524 while (50 * pagenum) < limit:
1525 self.report_download_page(query, pagenum+1)
1526 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1527 request = compat_urllib_request.Request(result_url)
1529 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1530 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1531 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1533 api_response = json.loads(data)['data']
1535 if not 'items' in api_response:
1536 self._downloader.trouble(u'[youtube] No video results')
1539 new_ids = list(video['id'] for video in api_response['items'])
1540 video_ids += new_ids
1542 limit = min(n, api_response['totalItems'])
1545 if len(video_ids) > n:
1546 video_ids = video_ids[:n]
# Hand each collected id back to the downloader as a watch URL.
1547 for id in video_ids:
1548 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1552 class GoogleSearchIE(InfoExtractor):
1553 """Information Extractor for Google Video search queries."""
# Same prefix scheme as YoutubeSearchIE: 'gvsearch:Q', 'gvsearchN:Q',
# 'gvsearchall:Q'. Results are scraped from HTML rather than an API.
1554 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1555 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1556 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1557 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1558 _max_google_results = 1000
1559 IE_NAME = u'video.google:search'
1561 def __init__(self, downloader=None):
1562 InfoExtractor.__init__(self, downloader)
1564 def report_download_page(self, query, pagenum):
1565 """Report attempt to download playlist page with given number."""
1566 query = query.decode(preferredencoding())
1567 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1569 def _real_extract(self, query):
1570 mobj = re.match(self._VALID_URL, query)
1572 self._downloader.report_error(u'invalid search query "%s"' % query)
1575 prefix, query = query.split(':')
1577 query = query.encode('utf-8')
1579 self._download_n_results(query, 1)
1581 elif prefix == 'all':
1582 self._download_n_results(query, self._max_google_results)
1588 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1590 elif n > self._max_google_results:
1591 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1592 n = self._max_google_results
1593 self._download_n_results(query, n)
1595 except ValueError: # parsing prefix as integer fails
1596 self._download_n_results(query, 1)
1599 def _download_n_results(self, query, n):
1600 """Downloads a specified number of results for a query"""
# Scrape result pages (10 per page) until n ids are collected or no
# "next page" link remains.
1606 self.report_download_page(query, pagenum)
1607 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1608 request = compat_urllib_request.Request(result_url)
1610 page = compat_urllib_request.urlopen(request).read()
1611 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1612 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1615 # Extract video identifiers
1616 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1617 video_id = mobj.group(1)
1618 if video_id not in video_ids:
1619 video_ids.append(video_id)
1620 if len(video_ids) == n:
1621 # Specified n videos reached
1622 for id in video_ids:
1623 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: dispatch whatever was found so far.
1626 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1627 for id in video_ids:
1628 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1631 pagenum = pagenum + 1
1634 class YahooSearchIE(InfoExtractor):
1635 """Information Extractor for Yahoo! Video search queries."""
# Same prefix scheme as the other search IEs: 'yvsearch:Q', 'yvsearchN:Q',
# 'yvsearchall:Q'.
1638 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1639 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1640 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1641 _MORE_PAGES_INDICATOR = r'\s*Next'
1642 _max_yahoo_results = 1000
1643 IE_NAME = u'video.yahoo:search'
1645 def __init__(self, downloader=None):
1646 InfoExtractor.__init__(self, downloader)
1648 def report_download_page(self, query, pagenum):
1649 """Report attempt to download playlist page with given number."""
1650 query = query.decode(preferredencoding())
1651 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1653 def _real_extract(self, query):
1654 mobj = re.match(self._VALID_URL, query)
1656 self._downloader.report_error(u'invalid search query "%s"' % query)
1659 prefix, query = query.split(':')
1661 query = query.encode('utf-8')
1663 self._download_n_results(query, 1)
1665 elif prefix == 'all':
1666 self._download_n_results(query, self._max_yahoo_results)
1672 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1674 elif n > self._max_yahoo_results:
1675 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1676 n = self._max_yahoo_results
1677 self._download_n_results(query, n)
1679 except ValueError: # parsing prefix as integer fails
1680 self._download_n_results(query, 1)
1683 def _download_n_results(self, query, n):
1684 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, duplicates are tracked with an explicit set for
# O(1) membership tests.
1687 already_seen = set()
1691 self.report_download_page(query, pagenum)
1692 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1693 request = compat_urllib_request.Request(result_url)
1695 page = compat_urllib_request.urlopen(request).read()
1696 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1697 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1700 # Extract video identifiers
1701 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1702 video_id = mobj.group(1)
1703 if video_id not in already_seen:
1704 video_ids.append(video_id)
1705 already_seen.add(video_id)
1706 if len(video_ids) == n:
1707 # Specified n videos reached
1708 for id in video_ids:
1709 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: dispatch whatever was found so far.
1712 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1713 for id in video_ids:
1714 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1717 pagenum = pagenum + 1
1720 class YoutubePlaylistIE(InfoExtractor):
1721 """Information Extractor for YouTube playlists."""
# Verbose-mode regex: matches playlist/course/artist/watch URLs (id in
# group 1) or a bare PL/EC/UU playlist id (group 2).
1723 _VALID_URL = r"""(?:
1728 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1729 \? (?:.*?&)*? (?:p|a|list)=
1732 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1735 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1737 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1739 IE_NAME = u'youtube:playlist'
1741 def __init__(self, downloader=None):
1742 InfoExtractor.__init__(self, downloader)
# Overridden because _VALID_URL is written with re.VERBOSE, which the base
# class's suitable() does not pass.
1745 def suitable(cls, url):
1746 """Receives a URL and returns True if suitable for this IE."""
1747 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1749 def report_download_page(self, playlist_id, pagenum):
1750 """Report attempt to download playlist page with given number."""
1751 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1753 def _real_extract(self, url):
1754 # Extract playlist id
1755 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1757 self._downloader.report_error(u'invalid url: %s' % url)
1760 # Download playlist videos from API
# Group 1 (URL form) or group 2 (bare id form), whichever matched.
1761 playlist_id = mobj.group(1) or mobj.group(2)
1766 self.report_download_page(playlist_id, page_num)
1768 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1770 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1771 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1772 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1776 response = json.loads(page)
1777 except ValueError as err:
1778 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1781 if not 'feed' in response or not 'entry' in response['feed']:
1782 self._downloader.report_error(u'Got a malformed response from YouTube API')
# Collect (position, src) pairs; entries without 'content' are skipped.
1784 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1785 for entry in response['feed']['entry']
1786 if 'content' in entry ]
# A short page means we reached the playlist's end.
1788 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then drop the position key.
1792 videos = [v[1] for v in sorted(videos)]
# Apply --playlist-start / --playlist-end (1-based; -1 end means "all").
1795 playliststart = self._downloader.params.get('playliststart', 1) - 1
1796 playlistend = self._downloader.params.get('playlistend', -1)
1797 if playlistend == -1:
1798 videos = videos[playliststart:]
1800 videos = videos[playliststart:playlistend]
1802 if len(videos) == total:
1803 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1805 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1807 for video in videos:
1808 self._downloader.download([video])
1812 class YoutubeChannelIE(InfoExtractor):
1813 """Information Extractor for YouTube channels."""
1815 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1816 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Literal text that only appears when a further page exists.
1817 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1818 IE_NAME = u'youtube:channel'
1820 def report_download_page(self, channel_id, pagenum):
1821 """Report attempt to download channel page with given number."""
1822 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1824 def _real_extract(self, url):
1825 # Extract channel id
1826 mobj = re.match(self._VALID_URL, url)
1828 self._downloader.report_error(u'invalid url: %s' % url)
1831 # Download channel pages
1832 channel_id = mobj.group(1)
# Scrape successive channel listing pages until the "Next" marker is gone.
1837 self.report_download_page(channel_id, pagenum)
1838 url = self._TEMPLATE_URL % (channel_id, pagenum)
1839 request = compat_urllib_request.Request(url)
1841 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1842 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1843 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1846 # Extract video identifiers
# Per-page dedup list; ids are appended page by page in listing order.
1848 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1849 if mobj.group(1) not in ids_in_page:
1850 ids_in_page.append(mobj.group(1))
1851 video_ids.extend(ids_in_page)
1853 if self._MORE_PAGES_INDICATOR not in page:
1855 pagenum = pagenum + 1
1857 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1859 for id in video_ids:
1860 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1864 class YoutubeUserIE(InfoExtractor):
1865 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/NAME URLs or the 'ytuser:NAME' shorthand.
1867 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1868 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1869 _GDATA_PAGE_SIZE = 50
1870 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1871 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1872 IE_NAME = u'youtube:user'
1874 def __init__(self, downloader=None):
1875 InfoExtractor.__init__(self, downloader)
1877 def report_download_page(self, username, start_index):
1878 """Report attempt to download user page."""
1879 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1880 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1882 def _real_extract(self, url):
1884 mobj = re.match(self._VALID_URL, url)
1886 self._downloader.report_error(u'invalid url: %s' % url)
1889 username = mobj.group(1)
1891 # Download video ids using YouTube Data API. Result size per
1892 # query is limited (currently to 50 videos) so we need to query
1893 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1900 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1901 self.report_download_page(username, start_index)
1903 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1906 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1907 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1908 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1911 # Extract video identifiers
1914 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1915 if mobj.group(1) not in ids_in_page:
1916 ids_in_page.append(mobj.group(1))
1918 video_ids.extend(ids_in_page)
1920 # A little optimization - if current page is not
1921 # "full", ie. does not contain PAGE_SIZE video ids then
1922 # we can assume that this page is the last one - there
1923 # are no more ids on further pages - no need to query
1926 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start / --playlist-end (1-based; -1 end means "all").
1931 all_ids_count = len(video_ids)
1932 playliststart = self._downloader.params.get('playliststart', 1) - 1
1933 playlistend = self._downloader.params.get('playlistend', -1)
1935 if playlistend == -1:
1936 video_ids = video_ids[playliststart:]
1938 video_ids = video_ids[playliststart:playlistend]
1940 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1941 (username, all_ids_count, len(video_ids)))
1943 for video_id in video_ids:
1944 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Scrapes the user's numeric id from their profile page, then pages
    through the mobile AJAX episode-list endpoint collecting video ids,
    and queues each video URL for download.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username from the URL (group 1 of _VALID_URL).
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # users_id placeholder is filled in after scraping it below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): sibling handlers format the error with compat_str(err);
            # str(err) here looks inconsistent — confirm.
            self._downloader.report_error(u'unable to download webpage: %s' % str(err))

        # Extract video identifiers (hrefs on the AJAX page, HTML-unescaped).
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; convert to 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        # Hand each collected video off to the regular blip.tv extractor.
        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button and extracts the real
    fileshare URL, the file title and the extension from the resulting
    page.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 emulates the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            self._downloader.report_error(u'unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.report_error(u'unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Fields of the returned info dictionary.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (from --username/--password or .netrc) during
    initialization, then parses the SWF parameter blob embedded in the
    video page to find the hd_src/sd_src stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # Login is best-effort: failures only emit warnings, never abort.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            # NOTE(review): "exceded" is misspelled in this user-facing message — confirm fix separately.
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video parameters are a JSON array sandwiched between these
        # two literal JavaScript fragments on the page.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON holding the stream descriptors.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        video_url = video_data['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # Fields of the returned info dictionary.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Resolves /play/ redirect URLs to canonical ones, then queries the
    site's JSON API (masquerading as iTunes); direct video responses are
    detected via the Content-Type header and downloaded as-is.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and recurse with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The API serves different (usable) data to iTunes clients.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # The payload is either wrapped in a 'Post' object or bare.
            if 'Post' in json_data:
                data = json_data['Post']

            # Site dates look like '10-31-12 09:30PM'; normalize to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # Fields of the returned info dictionary.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Derives the FLV stream URL from the thumbnail <link> tag on the
    watch page and takes the title from the page's <title> element.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.report_error` — `_download` does not
            # exist on InfoExtractor; every sibling extractor uses `_downloader`.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the movie base URL; appending
        # '/<id>.flv' yields the actual media URL.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves show shortcuts (:tds, :colbert, ...) to the full-episodes
    page, locates the mtvnservices media URI in the page, downloads the
    MRSS episode index, and emits one info dict per episode part after
    rewriting the RTMP URL to a plain HTTP one.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    """

    # Bitrates the site offers, lowest quality last-to-first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose (re.VERBOSE) pattern, so the base-class
        # suitable() cannot be used as-is.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # Shortcut forms (":tds", ":colbert", ...) are rewritten to the
        # show's full-episodes page and re-matched.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Clip URLs carry the title in a show-specific group.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # The "newest episode" page redirects; re-match against the final URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))

        # One <item> per episode part; each part becomes its own info dict.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            # Collect (bitrate, rtmp-url) pairs from the config feed.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                format, rtmp_video_url = f, v

            # Rewrite the RTMP URL into the equivalent plain-HTTP one
            # served from the llnwd.net CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the og:* meta tags for description/thumbnail/player, extracts
    the player's config URL, and takes the stream URL from the (almost-)
    JSON configuration it points to.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode using the charset announced in Content-Type, UTF-8 otherwise.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))

        # Pull metadata out of the page's meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # NOTE(review): index 1 of the playlist is assumed to be the actual
        # video entry — confirm against the config format.
        videoUrl = playlist[1]['url']

        # Fields of the returned info dictionary.
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for the video, then the Adobe
    f4m manifest it references, and assembles the final segment URL from
    the manifest's media/id nodes.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Partial info dict; remaining fields are filled from the XML below.
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.report_error(u'Invalid metadata XML file')

        # hdcore parameter is required by the Adobe HTTP Dynamic Streaming CDN.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        # Parse the f4m manifest (namespaced under ns.adobe.com/f4m/1.0).
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Build the first segment/fragment URL from the manifest pieces.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Extracts the URL-encoded flv_url parameter, the page title, and the
    thumbnail URL directly from the watch-page HTML.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is passed URL-encoded in the flv_url parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title is the <title> text up to the " - XVID" suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.report_error(u'unable to extract video thumbnail')
        # group(0) == the whole matched thumbnail URL.
        video_thumbnail = mobj.group(0)

        # Fields of the returned info dictionary.
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's API metadata via resolve.json.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the streams endpoint for the actual media URLs of this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        # Use the 128kbit/s MP3 HTTP stream.
        mediaURL = streams['http_mp3_128_url']

        # Fields of the returned info dictionary.
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): IE_NAME duplicates SoundcloudIE's u'soundcloud' —
    # presumably this should be distinct (e.g. 'soundcloud:set'); confirm.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): this class calls the legacy trouble() API while the
        # sibling SoundcloudIE uses report_error — confirm intended.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set page URL to the set's API metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        # The resolve endpoint reports problems in an 'errors' array.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))

        # One stream lookup (and one result entry) per track in the set.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

            streams = json.loads(stream_json)
            # Use the 128kbit/s MP3 HTTP stream.
            mediaURL = streams['http_mp3_128_url']

            # Fields of the per-track info dictionary.
            'uploader': track['user']['username'],
            'upload_date': track['created_at'],
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real media id is base64-encoded in the page's jsclassref
    attribute; it is decoded and turned into an rtmpe stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds a base64-encoded,
        # URL-quoted media id.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title from the page's JavaScript.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive the id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # Fields of the returned info dictionary.
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the URL; the first one that opens wins.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None
        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Walk the formats until we find one with a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Three URL shapes: a single video, a course page (playlist of
        # videos), or the site root (playlist of courses).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            # Recurse into each referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            # Recurse into each referenced course page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer come from <meta> tags on the page.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-random session id (timestamp + two random ints)."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character permutation table from the server-provided seed.

        Implements Youku's deterministic PRNG shuffle of the source alphabet.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated file id: each '*'-separated number indexes
        into the seed-derived permutation table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto Youku's format names.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        # The flv_url value is percent-encoded in the page.
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Small helper: first regex group from the page, unescaped,
            # or the default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed: was 'uploader_date', which no consumer reads; the
            # documented field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish; keep YYYYMMDD only.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives are paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player page heading; fall back to the <title> tag.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.trouble(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Override is needed because _VALID_URL uses verbose-mode syntax.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three scans run in page order, so zip pairs each movie with
        # its title and thumbnail.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return videos
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Direct CDN URL is derived from the numeric video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Extension is inferred from the matched media URL.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob in an inline script.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict matching req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: the availability check must test the lookup result,
            # not the stale 'result' regex match from earlier.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed: this failure is about the upload date, not the title.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The embed page carries the numeric id the player actually uses.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the player setup call
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
4089 class EightTracksIE(InfoExtractor):
4091 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4093 def _real_extract(self, url):
4094 mobj = re.match(self._VALID_URL, url)
4096 raise ExtractorError(u'Invalid URL: %s' % url)
4097 playlist_id = mobj.group('id')
4099 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded in the page as a JSON blob assigned to PAGE.mix.
4101 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4103 raise ExtractorError(u'Cannot find trax information')
4104 json_like = m.group(1)
4105 data = json.loads(json_like)
# A random session id keys the play API. NOTE(review): `mix_id` is assigned
# on an elided line (original 4108), presumably from `data` — confirm.
4107 session = str(random.randint(0, 1000000000))
4109 track_count = data['tracks_count']
4110 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4111 next_url = first_url
# Walk the play API one track at a time until at_last_track is reported.
4113 for i in itertools.count():
4114 api_json = self._download_webpage(next_url, playlist_id,
4115 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4116 errnote=u'Failed to download song information')
4117 api_data = json.loads(api_json)
4118 track_data = api_data[u'set']['track']
4120 'id': track_data['id'],
4121 'url': track_data['track_file_stream_url'],
4122 'title': track_data['performer'] + u' - ' + track_data['name'],
4123 'raw_title': track_data['name'],
4124 'uploader_id': data['user']['login'],
4128 if api_data['set']['at_last_track']:
4130 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
4133 class KeekIE(InfoExtractor):
4134 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4137 def _real_extract(self, url):
4138 m = re.match(self._VALID_URL, url)
4139 video_id = m.group('videoID')
# Media and thumbnail URLs are derived directly from the video id on the CDN.
4140 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4141 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4142 webpage = self._download_webpage(url, video_id)
# Title comes from the OpenGraph meta tag; uploader from the profile block.
4143 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4144 title = unescapeHTML(m.group('title'))
4145 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4146 uploader = clean_html(m.group('uploader'))
# NOTE(review): original lines 4147-4151 (start of the returned info dict)
# are elided from this chunk.
4152 'thumbnail': thumbnail,
4153 'uploader': uploader
4157 class TEDIE(InfoExtractor):
# Verbose regex: matches either a playlist URL or a single-talk URL; parts of
# the pattern (original lines 4159, 4161, 4163, 4165-4166) are elided here.
4158 _VALID_URL=r'''http://www.ted.com/
4160 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4162 ((?P<type_talk>talks)) # We have a simple talk
4164 /(?P<name>\w+) # Here goes the name and then ".html"
4168 def suitable(cls, url):
4169 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
4170 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4172 def _real_extract(self, url):
4173 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Dispatch: single talk vs. playlist, decided by which group matched.
4174 if m.group('type_talk'):
4175 return [self._talk_info(url)]
4177 playlist_id=m.group('playlist_id')
4178 name=m.group('name')
4179 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4180 return self._playlist_videos_info(url,name,playlist_id)
4182 def _talk_video_link(self,mediaSlug):
4183 '''Returns the video link for that mediaSlug'''
4184 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4186 def _playlist_videos_info(self,url,name,playlist_id=0):
4187 '''Returns the videos of the playlist'''
# NOTE(review): the assignment opening this verbose regex (original line
# 4188, presumably `video_RE=r'''...`) is elided from this chunk.
4189 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4190 ([.\s]*?)data-playlist_item_id="(\d+)"
4191 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4193 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4194 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4195 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4196 m_names=re.finditer(video_name_RE,webpage)
# Pair each video entry with its title link, in page order; each talk page is
# then fetched individually.  NOTE(review): `info` list init (original 4197)
# and the final return are elided.
4198 for m_video, m_name in zip(m_videos,m_names):
4199 video_id=m_video.group('video_id')
4200 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4201 info.append(self._talk_info(talk_url,video_id))
4204 def _talk_info(self, url, video_id=0):
4205 """Return the video for the talk in the url"""
4206 m=re.match(self._VALID_URL, url,re.VERBOSE)
4207 videoName=m.group('name')
4208 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4209 # If the url includes the language we get the title translated
4210 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4211 title=re.search(title_RE, webpage).group('title')
# The talkDetails JS object carries the numeric id and the mediaSlug used to
# build the download URL.
4212 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4213 "id":(?P<videoID>[\d]+).*?
4214 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4215 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4216 thumb_match=re.search(thumb_RE,webpage)
4217 info_match=re.search(info_RE,webpage,re.VERBOSE)
4218 video_id=info_match.group('videoID')
4219 mediaSlug=info_match.group('mediaSlug')
4220 video_url=self._talk_video_link(mediaSlug)
# NOTE(review): the info dict opening (original lines 4221-4225) is elided.
4226 'thumbnail': thumb_match.group('thumbnail')
4230 class MySpassIE(InfoExtractor):
4231 _VALID_URL = r'http://www.myspass.de/.*'
4233 def _real_extract(self, url):
4234 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4236 # video id is the last path element of the URL
4237 # usually there is a trailing slash, so also try the second but last
4238 url_path = compat_urllib_parse_urlparse(url).path
4239 url_parent_path, video_id = os.path.split(url_path)
# Fallback for a trailing slash: the split above yielded an empty last
# element, so take the parent path's last element instead (the guarding
# `if not video_id:` on original line 4240 is elided from this chunk).
4241 _, video_id = os.path.split(url_parent_path)
4244 metadata_url = META_DATA_URL_TEMPLATE % video_id
4245 metadata_text = self._download_webpage(metadata_url, video_id)
# The endpoint returns XML; re-encode to bytes before parsing.
4246 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4248 # extract values from metadata
4249 url_flv_el = metadata.find('url_flv')
4250 if url_flv_el is None:
4251 self._downloader.report_error(u'unable to extract download url')
4253 video_url = url_flv_el.text
4254 extension = os.path.splitext(video_url)[1][1:]
4255 title_el = metadata.find('title')
4256 if title_el is None:
4257 self._downloader.report_error(u'unable to extract title')
4259 title = title_el.text
# format/description/thumbnail are optional; their `else`/default branches
# (original lines 4262-4263, 4268-4269, 4273-4274) are elided here.
4260 format_id_el = metadata.find('format_id')
4261 if format_id_el is None:
4264 format = format_id_el.text
4265 description_el = metadata.find('description')
4266 if description_el is not None:
4267 description = description_el.text
4270 imagePreview_el = metadata.find('imagePreview')
4271 if imagePreview_el is not None:
4272 thumbnail = imagePreview_el.text
4281 'thumbnail': thumbnail,
4282 'description': description
4286 class SpiegelIE(InfoExtractor):
4287 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4289 def _real_extract(self, url):
4290 m = re.match(self._VALID_URL, url)
4291 video_id = m.group('videoID')
4293 webpage = self._download_webpage(url, video_id)
4294 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4296 raise ExtractorError(u'Cannot find title')
4297 video_title = unescapeHTML(m.group(1))
# Stream metadata lives in a per-video XML document on the flash server.
4299 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4300 xml_code = self._download_webpage(xml_url, video_id,
4301 note=u'Downloading XML', errnote=u'Failed to download XML')
4303 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element is used — presumably the best quality variant;
# confirm against the XML schema.
4304 last_type = idoc[-1]
4305 filename = last_type.findall('./filename')[0].text
4306 duration = float(last_type.findall('./duration')[0].text)
4308 video_url = 'http://video2.spiegel.de/flash/' + filename
4309 video_ext = filename.rpartition('.')[2]
4314 'title': video_title,
4315 'duration': duration,
4319 class LiveLeakIE(InfoExtractor):
4321 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4322 IE_NAME = u'liveleak'
4324 def _real_extract(self, url):
4325 mobj = re.match(self._VALID_URL, url)
4327 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4330 video_id = mobj.group('video_id')
4332 webpage = self._download_webpage(url, video_id)
4334 m = re.search(r'file: "(.*?)",', webpage)
4336 self._downloader.report_error(u'unable to find video url')
4338 video_url = m.group(1)
4340 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4342 self._downloader.trouble(u'Cannot find video title')
4343 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4345 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4347 desc = unescapeHTML(m.group('desc'))
4351 m = re.search(r'By:.*?(\w+)</a>', webpage)
4353 uploader = clean_html(m.group(1))
4362 'description': desc,
4363 'uploader': uploader
4368 class ARDIE(InfoExtractor):
4369 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4370 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
# One addMediaStream(...) call per available stream variant in the page JS.
4371 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4373 def _real_extract(self, url):
4374 # determine video id from url
4375 m = re.match(self._VALID_URL, url)
# Prefer an explicit documentId query parameter; fall back to the last path
# segment (the `if numid:` / `else:` lines are elided from this chunk).
4377 numid = re.search(r'documentId=([0-9]+)', url)
4379 video_id = numid.group(1)
4381 video_id = m.group('video_id')
4383 # determine title and media streams from webpage
4384 html = self._download_webpage(url, video_id)
4385 title = re.search(self._TITLE, html).group('title')
4386 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No-streams branch (guarding `if` on original line 4387 is elided): the page
# carries an "fsk" age-restriction marker in that case.
4388 assert '"fsk"' in html
4389 self._downloader.report_error(u'this video is only available after 8:00 pm')
4392 # choose default media type and highest quality for now
4393 stream = max([s for s in streams if int(s["media_type"]) == 0],
4394 key=lambda s: int(s["quality"]))
4396 # there's two possibilities: RTMP stream or HTTP download
4397 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4398 if stream['rtmp_url']:
4399 self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
4400 assert stream['video_url'].startswith('mp4:')
4401 info["url"] = stream["rtmp_url"]
4402 info["play_path"] = stream['video_url']
# HTTP branch (original `else:` line 4403 elided): direct MP4 download.
4404 assert stream["video_url"].endswith('.mp4')
4405 info["url"] = stream["video_url"]
4409 def gen_extractors():
4410 """ Return a list of an instance of every supported extractor.
4411 The order does matter; the first extractor matched is the one handling the URL.
4414 YoutubePlaylistIE(),
4439 StanfordOpenClassroomIE(),
4449 WorldStarHipHopIE(),