2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one line is elided from this view here (likely
        # per-instance state initialization) — confirm against full source.
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Matches against the subclass-defined _VALID_URL pattern.
        # NOTE(review): the cls-style signature suggests an elided
        # @classmethod decorator above this def — confirm in full source.
        return re.match(cls._VALID_URL, url) is not None

    # NOTE(review): the def line for the following method body (a
    # _WORKING getter) is elided from this view.
        """Getter method for _WORKING."""

    # NOTE(review): the def line for the following method body (an
    # initialize() entry point) is elided from this view.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): def line elided; derives the IE name from the class
    # name by dropping the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): an "if note is None:" guard appears elided before
        # this default assignment — confirm against full source.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        # NOTE(review): "try:" elided before the urlopen call.
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Default error note when the caller did not supply one
            # (the guarding "if errnote is None:" line is elided).
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the fallback branch for a missing charset match is
        # elided from this view.
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        # 'replace' substitutes undecodable bytes instead of raising.
        return webpage_bytes.decode(encoding, 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the opening of the _VALID_URL verbose raw-string
    # (assignment and r'''…) is elided from this view; the following
    # lines are its body.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                # NOTE(review): the "v=" alternative is elided here.
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        # NOTE(review): most itag -> extension entries are elided.
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): closing "}" of _video_extensions elided.
    _video_dimensions = {
    # NOTE(review): itag -> dimensions entries and closing "}" elided.

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs are handled by YoutubePlaylistIE, so reject them here.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        # Queries the timedtext listing endpoint and returns a dict of
        # lang_code -> track name, or an (error_message, None) tuple.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): "try:" elided before the urlopen call.
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Network failure is reported as a tuple rather than raised.
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable track name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): the success-path "return sub_lang_list" is elided.

    def _list_available_subtitles(self, video_id):
        # Prints the available subtitle languages for the video.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track and return it as a tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        # NOTE(review): the urlencode dict entries (lang, name, v, fmt)
        # and the closing "})" are elided from this view.
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): "try:" elided before the urlopen call.
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # NOTE(review): an "if not sub:" guard is elided before this return.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language selection: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): the 'en' branch body and the "else:" line are
            # elided; the next line is the final fallback.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): "return [subtitle]" is elided from this view.

    def _extract_all_subtitles(self, video_id):
        # Downloads every available subtitle track for the video.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): "subtitles = []" initialization is elided.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # NOTE(review): "return subtitles" is elided from this view.

    def _print_formats(self, formats):
        # Prints itag, extension and dimensions for each known format.
        print('Available formats:')
        # NOTE(review): "for x in formats:" is elided before this line.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Sets language, logs in (credentials or .netrc) and confirms age.
        if self._downloader is None:
            # NOTE(review): the early-return body is elided.

        # NOTE(review): username/password default initialization elided.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): "try:" elided before the netrc lookup.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): the branch that reads username/password from
            # the netrc entry is elided; the raise below is its else-arm.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                # NOTE(review): "return" elided.

        # Force the English-language site so later regexes match.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): "try:" and the self.report_lang() call are elided.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Best-effort: failing to set the language only warns.
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            # NOTE(review): "return" elided.

        # No authentication to be performed
        # NOTE(review): the "if username is None: return" guard is elided.

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # NOTE(review): "try:" elided before the urlopen call.
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            # NOTE(review): "return" elided.

        # Scrape the GALX hidden form field needed by the login POST.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the "if match:" guard is elided.
        galx = match.group(1)

        # Scrape the dsh hidden form field.
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the dsh assignment and the opening of the
        # login_form_strs dict literal are elided; its entries follow.
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # NOTE(review): remaining entries and the closing "}" are elided.

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # NOTE(review): the rest of this explanatory comment is elided.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # NOTE(review): "try:" and the self.report_login() call are elided.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # A re-rendered login form in the response means the login failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            # NOTE(review): "return" elided.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # NOTE(review): "return" elided.

        # Confirm age
        # NOTE(review): the opening of the age_form dict is elided.
            'action_confirm': 'Confirm',
        # NOTE(review): the closing "}" is elided.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): "try:" elided before the urlopen call.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            # NOTE(review): "return" elided.

    def _extract_id(self, url):
        # Extracts the 11-character-style video id via _VALID_URL group 2.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the "if mobj is None:" guard is elided.
            self._downloader.report_error(u'invalid URL: %s' % url)
            # NOTE(review): "return" elided.
        video_id = mobj.group(2)
        # NOTE(review): "return video_id" is elided from this view.

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): the "if mobj:" guard is elided.
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): "try:" elided before the urlopen call.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            # NOTE(review): "return" elided.

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the "if mobj is not None:" guard is elided.
        # Un-escape the JS-escaped slashes in the SWF URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # NOTE(review): the "else: player_url = None" branch is elided.

        # Get video info: try several "el" values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # NOTE(review): "try:" elided before the urlopen call.
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): "break" elided.
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                # NOTE(review): "return" elided.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the "else:" line is elided.
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            # NOTE(review): "return" elided.

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            # NOTE(review): "return" elided.

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            # NOTE(review): "return" elided.
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best-effort; missing id only warns)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): the "if mobj is not None:" guard is elided.
        video_uploader_id = mobj.group(1)
        # NOTE(review): the "else:" line is elided.
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            # NOTE(review): "return" elided.
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # NOTE(review): the fallback assignment is elided.
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        # NOTE(review): "upload_date = None" default is elided.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): the "if mobj is not None:" guard is elided.
        # Normalize separators to spaces before strptime.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            # NOTE(review): "try:" elided; a ValueError pass-through
            # fallback for non-matching formats is also elided.
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): the "else:" line is elided.
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # NOTE(review): an "if video_subtitles:" guard is elided.
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # NOTE(review): an "if sub_error:" guard is elided.
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # NOTE(review): an "if sub_error:" guard is elided.
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # NOTE(review): "return" elided.

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            # NOTE(review): the fallback assignment and "else:" are elided.
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams carry a single URL in 'conn'.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit.
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): the "else:" line is elided.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                # NOTE(review): "return" elided.
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): "return" elided.
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the "else:" line is elided.
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): an "if rf in url_map:" guard is elided.
                    video_url_list = [(rf, url_map[rf])]
                    # NOTE(review): "break" elided.
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    # NOTE(review): "return" elided.
        # NOTE(review): the trailing "else:" line is elided.
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            # NOTE(review): "return" elided.

        # NOTE(review): "results = []" initialization is elided.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): "results.append({" and the 'id' entry are
            # elided; the remaining info-dict entries follow.
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            # NOTE(review): the closing "})" and "return results" are elided.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Fetches the family-filter disclaimer page, then POSTs the
        # age-confirmation form so restricted videos are served.
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): "try:" elided before the urlopen call.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            # NOTE(review): "return" elided.

        # Confirm age
        # NOTE(review): the opening of the disclaimer_form dict is elided.
            'submit': "Continue - I'm over 18",
        # NOTE(review): the closing "}" is elided.
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): "try:" elided before the urlopen call.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            # NOTE(review): "return" elided.

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'invalid URL: %s' % url)
        # NOTE(review): "return" elided.

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # "yt-" prefixed ids are YouTube-hosted; delegate the download.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            # NOTE(review): "return" elided.

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): "try:" elided before the urlopen call.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
            # NOTE(review): "return" elided.

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): the "if mobj is not None:" guard is elided.
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # NOTE(review): the "if mobj is None:" branch is elided; the
        # following two lines are the gdaKey-present branch.
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # NOTE(review): an "else:" (no &mediaURL match) line is elided
        # before the flashvars fallback below.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract media URL')
        # NOTE(review): "return" elided.

        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.report_error(u'unable to extract media URL')
            # NOTE(review): "return" elided.
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract media URL')
        # NOTE(review): "return" elided.
        # Un-escape JSON-escaped slashes in the media URL.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract title')
        # NOTE(review): "return" elided.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract uploader nickname')
        # NOTE(review): "return" elided.
        video_uploader = mobj.group(1)

        # NOTE(review): "return [{" is elided; the info-dict entries follow.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            # NOTE(review): the 'upload_date' entry is elided in this view.
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        # NOTE(review): the closing "}]" is elided.
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'invalid URL: %s' % url)
        # NOTE(review): "return" elided.

        # Strip title/query suffixes from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract media URL')
        # NOTE(review): "return" elided.
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst and keep the first present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): an "if key in flashvars:" guard is elided.
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            # NOTE(review): the "max_quality = key; break" lines and the
            # no-match "else:" opening are elided before the error below.
            self._downloader.report_error(u'unable to extract video URL')
            # NOTE(review): "return" elided.

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract video URL')
        # NOTE(review): "return" elided.

        # Un-escape JSON-escaped slashes in the media URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract title')
        # NOTE(review): "return" elided.
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): the "if mobj is None:" guard is elided.
        # looking for the official user account instead
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.report_warning(u'unable to extract uploader nickname')
        # NOTE(review): the "else:" line is elided.
            video_uploader = mobj_official.group(1)
        # NOTE(review): the outer "else:" line is elided.
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): the "if mobj is not None:" guard is elided.
        # Reassemble DD-MM-YYYY into YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): "return [{" and the 'id'/'url' entries are elided;
        # the remaining info-dict entries follow.
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        # NOTE(review): the closing "}]" is elided.
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'Invalid URL: %s' % url)
        # NOTE(review): "return" elided.

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): "try:" elided before the urlopen call.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            # NOTE(review): "return" elided.

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract media URL')
        # NOTE(review): "return" elided.
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # NOTE(review): the "video_url = mediaURL" assignment is elided.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): the "if mobj is None:" guard is elided.
        self._downloader.report_error(u'unable to extract title')
        # NOTE(review): "return" elided.
        video_title = mobj.group(1).decode('utf-8')

        # Group 2 of the same <title> match is the uploader name.
        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): "return [{" is elided; the info-dict entries follow.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            # NOTE(review): the 'upload_date' entry is elided in this view.
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        # NOTE(review): the closing "}]" is elided.
923 class YahooIE(InfoExtractor):
924 """Information extractor for video.yahoo.com."""
927 # _VALID_URL matches all Yahoo! Video URLs
928 # _VPAGE_URL matches only the extractable '/watch/' URLs
929 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
930 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
931 IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
944 def _real_extract(self, url, new_video=True):
945 # Extract ID from URL
946 mobj = re.match(self._VALID_URL, url)
948 self._downloader.report_error(u'Invalid URL: %s' % url)
951 video_id = mobj.group(2)
952 video_extension = 'flv'
954 # Rewrite valid but non-extractable URLs as
955 # extractable English language /watch/ URLs
956 if re.match(self._VPAGE_URL, url) is None:
957 request = compat_urllib_request.Request(url)
959 webpage = compat_urllib_request.urlopen(request).read()
960 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
961 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
964 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
966 self._downloader.report_error(u'Unable to extract id field')
968 yahoo_id = mobj.group(1)
970 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
972 self._downloader.report_error(u'Unable to extract vid field')
974 yahoo_vid = mobj.group(1)
976 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
977 return self._real_extract(url, new_video=False)
979 # Retrieve video webpage to extract further information
980 request = compat_urllib_request.Request(url)
982 self.report_download_webpage(video_id)
983 webpage = compat_urllib_request.urlopen(request).read()
984 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
985 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
988 # Extract uploader and title from webpage
989 self.report_extraction(video_id)
990 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
992 self._downloader.report_error(u'unable to extract video title')
994 video_title = mobj.group(1).decode('utf-8')
996 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
998 self._downloader.report_error(u'unable to extract video uploader')
1000 video_uploader = mobj.group(1).decode('utf-8')
1002 # Extract video thumbnail
1003 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1005 self._downloader.report_error(u'unable to extract video thumbnail')
1007 video_thumbnail = mobj.group(1).decode('utf-8')
1009 # Extract video description
1010 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1012 self._downloader.report_error(u'unable to extract video description')
1014 video_description = mobj.group(1).decode('utf-8')
1015 if not video_description:
1016 video_description = 'No description available.'
1018 # Extract video height and width
1019 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1021 self._downloader.report_error(u'unable to extract video height')
1023 yv_video_height = mobj.group(1)
1025 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1027 self._downloader.report_error(u'unable to extract video width')
1029 yv_video_width = mobj.group(1)
1031 # Retrieve video playlist to extract media URL
1032 # I'm not completely sure what all these options are, but we
1033 # seem to need most of them, otherwise the server sends a 401.
1034 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1035 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1036 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1037 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1038 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1040 self.report_download_webpage(video_id)
1041 webpage = compat_urllib_request.urlopen(request).read()
1042 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1043 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1046 # Extract media URL from playlist XML
1047 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1049 self._downloader.report_error(u'Unable to extract media URL')
1051 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1052 video_url = unescapeHTML(video_url)
1055 'id': video_id.decode('utf-8'),
1057 'uploader': video_uploader,
1058 'upload_date': None,
1059 'title': video_title,
1060 'ext': video_extension.decode('utf-8'),
1061 'thumbnail': video_thumbnail.decode('utf-8'),
1062 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs; named groups:
    #   proto       - optional scheme (missing scheme is rewritten to https below)
    #   direct_link - a play_redirect_hls deep link that is normalized away
    #   id          - the numeric clip id
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        """Constructor; simply forwards the optional downloader to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize the URL: force https and collapse direct HLS links
        # to the canonical clip page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player bootstrap code.
        # NOTE(review): string-splitting on ' = {config:' / ',assets:' is
        # fragile against page-markup changes.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.report_error(u'unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD, built from the ISO dateCreated meta tag)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # Fall back to whatever quality the codec lists first.
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available (codec, extension, quality) triple in order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        self._downloader.report_error(u'no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Fields of the single info dictionary returned to the FileDownloader.
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # French/German arte.tv video pages
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages are recognized by their index-NNN.html basename
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor; simply forwards the optional downloader to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw content."""
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and apply *regex* (with *regexFlags*) to the page.

        matchTuples is a list of (group_index, key, error_message); each
        named group index is stored into the returned info dict under *key*,
        with *error_message* reported if the group did not match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve an arte.tv live-stream page to its media URL."""
        # URL layout puts the language segment four components from the end.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte.tv '+7' (catch-up) page through its chain of
        intermediate XML documents down to the HD media URL."""
        # URL layout puts the language segment three components from the end.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)

        # Fields of the info dictionary built from the grep results.
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 extractor based on the URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor; simply forwards the optional downloader to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # The fallback warning is suppressed during the test suite runs.
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are transferred.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with just the handlers needed for the probe.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Hand the resolved URL back to the downloader so the full
        # extractor chain is restarted against it.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])

    def _real_extract(self, url):
        """Best-effort extraction for pages no dedicated IE claims."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        self._downloader.report_error(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.report_error(u'unable to extract title')
        video_uploader = mobj.group(1)

        # Fields of the single info dictionary returned to the FileDownloader.
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Matches e.g. 'ytsearch:foo', 'ytsearch5:foo', 'ytsearchall:foo'
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData v2 search endpoint, JSON-C format, 50 results per page
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor; simply forwards the optional downloader to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearch[N|all]:terms' pseudo-URL and run the search."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> site maximum; a number -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the API (50 ids per request) until n ids are collected
        # or the reported total is exhausted.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Matches e.g. 'gvsearch:foo', 'gvsearch5:foo', 'gvsearchall:foo'
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next page" link in the result HTML
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor; simply forwards the optional downloader to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearch[N|all]:terms' pseudo-URL and run the search."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> site maximum; a number -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Scrape HTML result pages until n ids are found or no next page exists.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: queue what was collected so far.
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Matches e.g. 'yvsearch:foo', 'yvsearch5:foo', 'yvsearchall:foo'
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Presence of a "Next" link in the result HTML
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor; simply forwards the optional downloader to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearch[N|all]:terms' pseudo-URL and run the search."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> site maximum; a number -> that many.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # already_seen guards against duplicate ids across result pages.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: queue what was collected so far.
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex (compiled with re.VERBOSE in suitable/_real_extract):
    # accepts playlist/course/artist/watch URLs as well as bare PL/EC/UU ids.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    # GData v2 playlist feed, JSON format, paged via max-results/start-index
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor; simply forwards the optional downloader to the base class."""
        InfoExtractor.__init__(self, downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in verbose syntax.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video URLs of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'invalid url: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)

        self.report_download_page(playlist_id, page_num)

        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = compat_urllib_request.urlopen(url).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        response = json.loads(page)
        except ValueError as err:
            self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))

        if not 'feed' in response or not 'entry' in response['feed']:
            self._downloader.report_error(u'Got a malformed response from YouTube API')
        # Keep (position, url) pairs so entries can be re-sorted by playlist order.
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        # A short page means the feed is exhausted.
        if len(response['feed']['entry']) < self._MAX_RESULTS:

        videos = [v[1] for v in sorted(videos)]

        # Apply the user's --playlist-start / --playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    # Channel video list, oldest first ('sort=da'), English/US locale
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Literal text of the pagination link marking that more pages exist
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the channel and queue them for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging once the "Next »" link disappears.
        if self._MORE_PAGES_INDICATOR not in page:

        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Accepts user page URLs as well as the 'ytuser:NAME' shorthand
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads-feed request at 50 entries
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor; simply forwards the optional downloader to the base class."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of the user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply the user's --playlist-start / --playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1944 class BlipTVUserIE(InfoExtractor):
1945 """Information Extractor for blip.tv users."""
# NOTE(review): this listing is elided -- guard/try/return lines between the
# numbered statements are not visible; comments describe only the visible code.
# Matches user pages such as http://blip.tv/<user> or the bliptvuser: shorthand.
1947 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1949 IE_NAME = u'blip.tv:user'
1951 def __init__(self, downloader=None):
1952 InfoExtractor.__init__(self, downloader)
1954 def report_download_page(self, username, pagenum):
1955 """Report attempt to download user page."""
1956 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1957 (self.IE_NAME, username, pagenum))
1959 def _real_extract(self, url):
1961 mobj = re.match(self._VALID_URL, url)
1963 self._downloader.report_error(u'invalid url: %s' % url)
1966 username = mobj.group(1)
# The mobile site exposes the numeric users_id required by the AJAX endpoint.
1968 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1970 request = compat_urllib_request.Request(url)
1973 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1974 mobj = re.search(r'data-users-id="([^"]+)"', page)
1975 page_base = page_base % mobj.group(1)
1976 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1977 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1981 # Download video ids using BlipTV Ajax calls. Result size per
1982 # query is limited (currently to 12 videos) so we need to query
1983 # page by page until there are no video ids - it means we got
1990 self.report_download_page(username, pagenum)
1991 url = page_base + "&page=" + str(pagenum)
1992 request = compat_urllib_request.Request( url )
1994 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1995 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1996 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
1999 # Extract video identifiers
2002 for mobj in re.finditer(r'href="/([^"]+)"', page):
2003 if mobj.group(1) not in ids_in_page:
2004 ids_in_page.append(unescapeHTML(mobj.group(1)))
2006 video_ids.extend(ids_in_page)
2008 # A little optimization - if current page is not
2009 # "full", ie. does not contain PAGE_SIZE video ids then
2010 # we can assume that this page is the last one - there
2011 # are no more ids on further pages - no need to query
2014 if len(ids_in_page) < self._PAGE_SIZE:
# Honour --playlist-start / --playlist-end: slice the collected id list
# (playliststart is converted from 1-based to 0-based).
2019 all_ids_count = len(video_ids)
2020 playliststart = self._downloader.params.get('playliststart', 1) - 1
2021 playlistend = self._downloader.params.get('playlistend', -1)
2023 if playlistend == -1:
2024 video_ids = video_ids[playliststart:]
2026 video_ids = video_ids[playliststart:playlistend]
2028 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2029 (self.IE_NAME, username, all_ids_count, len(video_ids)))
# Hand each collected id back to the downloader as a regular blip.tv URL.
2031 for video_id in video_ids:
2032 self._downloader.download([u'http://blip.tv/'+video_id])
2035 class DepositFilesIE(InfoExtractor):
2036 """Information extractor for depositfiles.com"""
# NOTE(review): elided listing -- try:/if-guards/return lines between the
# numbered statements are missing; comments describe only the visible code.
2038 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2040 def report_download_webpage(self, file_id):
2041 """Report webpage download."""
2042 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2044 def report_extraction(self, file_id):
2045 """Report information extraction."""
2046 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2048 def _real_extract(self, url):
2049 file_id = url.split('/')[-1]
2050 # Rebuild url in english locale
2051 url = 'http://depositfiles.com/en/files/' + file_id
2053 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
2054 free_download_indication = { 'gateway_result' : '1' }
2055 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2057 self.report_download_webpage(file_id)
2058 webpage = compat_urllib_request.urlopen(request).read()
2059 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2060 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2063 # Search for the real file URL
2064 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2065 if (mobj is None) or (mobj.group(1) is None):
2066 # Try to figure out reason of the error.
# The site embeds a human-readable restriction notice in a <strong> tag;
# surface it to the user instead of a generic failure.
2067 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2068 if (mobj is not None) and (mobj.group(1) is not None):
2069 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2070 self._downloader.report_error(u'%s' % restriction_message)
2072 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2075 file_url = mobj.group(1)
2076 file_extension = os.path.splitext(file_url)[1][1:]
2078 # Search for file title
2079 mobj = re.search(r'<b title="(.*?)">', webpage)
2081 self._downloader.report_error(u'unable to extract title')
2083 file_title = mobj.group(1).decode('utf-8')
# Resulting info dictionary (Python-2 style: byte strings .decode()d to unicode).
2086 'id': file_id.decode('utf-8'),
2087 'url': file_url.decode('utf-8'),
2089 'upload_date': None,
2090 'title': file_title,
2091 'ext': file_extension.decode('utf-8'),
2095 class FacebookIE(InfoExtractor):
2096 """Information Extractor for Facebook"""
# NOTE(review): elided listing -- several guard/try lines are missing between
# the numbered statements; comments describe only the visible code.
2098 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2099 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2100 _NETRC_MACHINE = 'facebook'
2101 IE_NAME = u'facebook'
2103 def report_login(self):
2104 """Report attempt to log in."""
2105 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2107 def _real_initialize(self):
# Without a downloader there are no credentials to read, so skip login.
2108 if self._downloader is None:
2113 downloader_params = self._downloader.params
2115 # Attempt to use provided username and password or .netrc data
2116 if downloader_params.get('username', None) is not None:
2117 useremail = downloader_params['username']
2118 password = downloader_params['password']
2119 elif downloader_params.get('usenetrc', False):
# Fall back to the 'facebook' machine entry in ~/.netrc.
2121 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2122 if info is not None:
2126 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2127 except (IOError, netrc.NetrcParseError) as err:
2128 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2131 if useremail is None:
2140 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
# A login <form> in the response means we are still on the login page,
# i.e. authentication failed (bad credentials or rate limit).
2143 login_results = compat_urllib_request.urlopen(request).read()
2144 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2145 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2147 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2148 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2151 def _real_extract(self, url):
2152 mobj = re.match(self._VALID_URL, url)
2154 self._downloader.report_error(u'invalid URL: %s' % url)
2156 video_id = mobj.group('ID')
2158 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2159 webpage = self._download_webpage(url, video_id)
# The player parameters are embedded as a JSON array between these two
# exact JavaScript fragments in the page source.
2161 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2162 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2163 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2165 raise ExtractorError(u'Cannot parse data')
2166 data = dict(json.loads(m.group(1)))
2167 params_raw = compat_urllib_parse.unquote(data['params'])
2168 params = json.loads(params_raw)
# Prefer the HD stream; the visible fallback is the SD stream.
2169 video_url = params['hd_src']
2171 video_url = params['sd_src']
2173 raise ExtractorError(u'Cannot find video URL')
2174 video_duration = int(params['video_duration'])
2176 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2178 raise ExtractorError(u'Cannot find title in webpage')
2179 video_title = unescapeHTML(m.group(1))
2183 'title': video_title,
2186 'duration': video_duration,
2187 'thumbnail': params['thumbnail_src'],
2192 class BlipTVIE(InfoExtractor):
2193 """Information extractor for blip.tv"""
# NOTE(review): elided listing -- try:/guard/return lines between the numbered
# statements are not visible; comments describe only the visible code.
2195 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2196 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2197 IE_NAME = u'blip.tv'
2199 def report_extraction(self, file_id):
2200 """Report information extraction."""
2201 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2203 def report_direct_download(self, title):
2204 """Report information extraction."""
2205 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2207 def _real_extract(self, url):
2208 mobj = re.match(self._VALID_URL, url)
2210 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file id;
# rewrite to the canonical /a/a-<id> form and recurse once.
2213 urlp = compat_urllib_parse_urlparse(url)
2214 if urlp.path.startswith('/play/'):
2215 request = compat_urllib_request.Request(url)
2216 response = compat_urllib_request.urlopen(request)
2217 redirecturl = response.geturl()
2218 rurlp = compat_urllib_parse_urlparse(redirecturl)
2219 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2220 url = 'http://blip.tv/a/a-' + file_id
2221 return self._real_extract(url)
# Ask the site for its JSON representation; the iTunes UA is required for
# this endpoint to answer.
2228 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2229 request = compat_urllib_request.Request(json_url)
2230 request.add_header('User-Agent', 'iTunes/10.6.1')
2231 self.report_extraction(mobj.group(1))
2234 urlh = compat_urllib_request.urlopen(request)
# A video/* Content-Type means the URL already points at the media file.
2235 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2236 basename = url.split('/')[-1]
2237 title,ext = os.path.splitext(basename)
2238 title = title.decode('UTF-8')
2239 ext = ext.replace('.', '')
2240 self.report_direct_download(title)
2245 'upload_date': None,
2250 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2251 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2252 if info is None: # Regular URL
2254 json_code_bytes = urlh.read()
2255 json_code = json_code_bytes.decode('utf-8')
2256 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2257 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2261 json_data = json.loads(json_code)
2262 if 'Post' in json_data:
2263 data = json_data['Post']
# The API reports e.g. '12-31-12 11:45PM'; normalise to YYYYMMDD.
2267 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2268 video_url = data['media']['url']
2269 umobj = re.match(self._URL_EXT, video_url)
2271 raise ValueError('Can not determine filename extension')
2272 ext = umobj.group(1)
2275 'id': data['item_id'],
2277 'uploader': data['display_name'],
2278 'upload_date': upload_date,
2279 'title': data['title'],
2281 'format': data['media']['mimeType'],
2282 'thumbnail': data['thumbnailUrl'],
2283 'description': data['description'],
2284 'player_url': data['embedUrl'],
2285 'user_agent': 'iTunes/10.6.1',
2287 except (ValueError,KeyError) as err:
2288 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2294 class MyVideoIE(InfoExtractor):
2295 """Information Extractor for myvideo.de."""
2297 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2298 IE_NAME = u'myvideo'
2300 def __init__(self, downloader=None):
2301 InfoExtractor.__init__(self, downloader)
2303 def report_extraction(self, video_id):
2304 """Report information extraction."""
2305 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2307 def _real_extract(self,url):
2308 mobj = re.match(self._VALID_URL, url)
2310 self._download.report_error(u'invalid URL: %s' % url)
2313 video_id = mobj.group(1)
2316 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2317 webpage = self._download_webpage(webpage_url, video_id)
2319 self.report_extraction(video_id)
2320 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2323 self._downloader.report_error(u'unable to extract media URL')
2325 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2327 mobj = re.search('<title>([^<]+)</title>', webpage)
2329 self._downloader.report_error(u'unable to extract title')
2332 video_title = mobj.group(1)
2338 'upload_date': None,
2339 'title': video_title,
2343 class ComedyCentralIE(InfoExtractor):
2344 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): elided listing -- try:/guard/return lines between the numbered
# statements are not visible; comments describe only the visible code.
2346 # urls can be abbreviations like :thedailyshow or :colbert
2347 # urls for episodes like:
2348 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2349 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2350 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2351 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2352 |(https?://)?(www\.)?
2353 (?P<showname>thedailyshow|colbertnation)\.com/
2354 (full-episodes/(?P<episode>.*)|
2356 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2357 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2360 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2362 _video_extensions = {
2370 _video_dimensions = {
# Override: _VALID_URL above is a verbose-mode pattern, so the base-class
# suitable() (plain re.match) would not work here.
2380 def suitable(cls, url):
2381 """Receives a URL and returns True if suitable for this IE."""
2382 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2384 def report_extraction(self, episode_id):
2385 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2387 def report_config_download(self, episode_id, media_id):
2388 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2390 def report_index_download(self, episode_id):
2391 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2393 def _print_formats(self, formats):
2394 print('Available formats:')
2396 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2399 def _real_extract(self, url):
2400 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2402 self._downloader.report_error(u'invalid URL: %s' % url)
# Expand :tds / :colbert style shorthands to the full-episodes front page
# and re-match so the named groups are populated.
2405 if mobj.group('shortname'):
2406 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2407 url = u'http://www.thedailyshow.com/full-episodes/'
2409 url = u'http://www.colbertnation.com/full-episodes/'
2410 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2411 assert mobj is not None
2413 if mobj.group('clip'):
2414 if mobj.group('showname') == 'thedailyshow':
2415 epTitle = mobj.group('tdstitle')
2417 epTitle = mobj.group('cntitle')
2420 dlNewest = not mobj.group('episode')
2422 epTitle = mobj.group('showname')
2424 epTitle = mobj.group('episode')
2426 req = compat_urllib_request.Request(url)
2427 self.report_extraction(epTitle)
2429 htmlHandle = compat_urllib_request.urlopen(req)
2430 html = htmlHandle.read()
2431 webpage = html.decode('utf-8')
2432 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2433 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The front page redirects to the newest episode; re-match the final URL
# to recover a concrete episode title.
2436 url = htmlHandle.geturl()
2437 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2439 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2441 if mobj.group('episode') == '':
2442 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2444 epTitle = mobj.group('episode')
2446 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2448 if len(mMovieParams) == 0:
2449 # The Colbert Report embeds the information in a without
2450 # a URL prefix; so extract the alternate reference
2451 # and then add the URL prefix manually.
2453 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2454 if len(altMovieParams) == 0:
2455 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2458 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Fetch the MRSS index for this mgid URI; each <item> is one part of the show.
2460 uri = mMovieParams[0][1]
2461 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2462 self.report_index_download(epTitle)
2464 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2465 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2466 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2471 idoc = xml.etree.ElementTree.fromstring(indexXml)
2472 itemEls = idoc.findall('.//item')
2473 for partNum,itemEl in enumerate(itemEls):
2474 mediaId = itemEl.findall('./guid')[0].text
2475 shortMediaId = mediaId.split(':')[-1]
2476 showId = mediaId.split(':')[-2].replace('.com', '')
2477 officialTitle = itemEl.findall('./title')[0].text
2478 officialDate = itemEl.findall('./pubDate')[0].text
# Per-part config XML lists the available renditions (bitrate -> src URL).
2480 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2481 compat_urllib_parse.urlencode({'uri': mediaId}))
2482 configReq = compat_urllib_request.Request(configUrl)
2483 self.report_config_download(epTitle, shortMediaId)
2485 configXml = compat_urllib_request.urlopen(configReq).read()
2486 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2487 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2490 cdoc = xml.etree.ElementTree.fromstring(configXml)
2492 for rendition in cdoc.findall('.//rendition'):
2493 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2497 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2500 if self._downloader.params.get('listformats', None):
2501 self._print_formats([i[0] for i in turls])
2504 # For now, just pick the highest bitrate
2505 format,rtmp_video_url = turls[-1]
2507 # Get the format arg from the arg stream
2508 req_format = self._downloader.params.get('format', None)
2510 # Select format if we can find one
2513 format, rtmp_video_url = f, v
# Rewrite the rtmp(e) URL to the equivalent plain-HTTP mp4 mirror.
2516 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2518 raise ExtractorError(u'Cannot transform RTMP url')
2519 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2520 video_url = base + m.group('finalid')
2522 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2527 'upload_date': officialDate,
2532 'description': officialTitle,
2534 results.append(info)
2539 class EscapistIE(InfoExtractor):
2540 """Information extractor for The Escapist """
# NOTE(review): elided listing -- try:/guard lines between the numbered
# statements are not visible; comments describe only the visible code.
2542 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2543 IE_NAME = u'escapist'
2545 def report_extraction(self, showName):
2546 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2548 def report_config_download(self, showName):
2549 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2551 def _real_extract(self, url):
2552 mobj = re.match(self._VALID_URL, url)
2554 self._downloader.report_error(u'invalid URL: %s' % url)
2556 showName = mobj.group('showname')
2557 videoId = mobj.group('episode')
2559 self.report_extraction(showName)
2561 webPage = compat_urllib_request.urlopen(url)
2562 webPageBytes = webPage.read()
# Decode using the charset advertised in the Content-Type header,
# defaulting to UTF-8 when none is given.
2563 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2564 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2565 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2566 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Page metadata: description, thumbnail and the player URL, whose
# config= query parameter points at the JSON-ish player configuration.
2569 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2570 description = unescapeHTML(descMatch.group(1))
2571 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2572 imgUrl = unescapeHTML(imgMatch.group(1))
2573 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2574 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2575 configUrlMatch = re.search('config=(.*)$', playerUrl)
2576 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2578 self.report_config_download(showName)
2580 configJSON = compat_urllib_request.urlopen(configUrl)
2581 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2582 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2583 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2584 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2587 # Technically, it's JavaScript, not JSON
# The config uses single quotes; swap to double quotes so json.loads accepts it.
2588 configJSON = configJSON.replace("'", '"')
2591 config = json.loads(configJSON)
2592 except (ValueError,) as err:
2593 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2596 playlist = config['playlist']
# playlist[1] holds the actual episode entry (index 0 is something else,
# presumably an intro/ad -- TODO confirm).
2597 videoUrl = playlist[1]['url']
2602 'uploader': showName,
2603 'upload_date': None,
2606 'thumbnail': imgUrl,
2607 'description': description,
2608 'player_url': playerUrl,
2613 class CollegeHumorIE(InfoExtractor):
2614 """Information extractor for collegehumor.com"""
# NOTE(review): elided listing -- try:/guard lines between the numbered
# statements are not visible; comments describe only the visible code.
2617 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2618 IE_NAME = u'collegehumor'
2620 def report_manifest(self, video_id):
2621 """Report information extraction."""
2622 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2624 def report_extraction(self, video_id):
2625 """Report information extraction."""
2626 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2628 def _real_extract(self, url):
2629 mobj = re.match(self._VALID_URL, url)
2631 self._downloader.report_error(u'invalid URL: %s' % url)
2633 video_id = mobj.group('videoid')
2638 'upload_date': None,
2641 self.report_extraction(video_id)
# Step 1: the moogaloop metadata XML gives title/description/thumbnail and
# the URL of the f4m manifest.
2642 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2644 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2645 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2646 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2649 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2651 videoNode = mdoc.findall('./video')[0]
2652 info['description'] = videoNode.findall('./description')[0].text
2653 info['title'] = videoNode.findall('./caption')[0].text
2654 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2655 manifest_url = videoNode.findall('./file')[0].text
2657 self._downloader.report_error(u'Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest; hdcore is required by the server.
2660 manifest_url += '?hdcore=2.10.3'
2661 self.report_manifest(video_id)
2663 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2664 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2665 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2668 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2670 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2671 node_id = media_node.attrib['url']
2672 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2673 except IndexError as err:
2674 self._downloader.report_error(u'Invalid manifest file')
# Step 3: assemble the first-fragment URL from the manifest's id and media url.
2677 url_pr = compat_urllib_parse_urlparse(manifest_url)
2678 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2685 class XVideosIE(InfoExtractor):
2686 """Information extractor for xvideos.com"""
# NOTE(review): elided listing -- guard/return lines between the numbered
# statements are not visible; comments describe only the visible code.
2688 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2689 IE_NAME = u'xvideos'
2691 def report_extraction(self, video_id):
2692 """Report information extraction."""
2693 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2695 def _real_extract(self, url):
2696 mobj = re.match(self._VALID_URL, url)
2698 self._downloader.report_error(u'invalid URL: %s' % url)
2700 video_id = mobj.group(1)
2702 webpage = self._download_webpage(url, video_id)
2704 self.report_extraction(video_id)
# The media URL is passed URL-encoded in the player's flv_url parameter.
2708 mobj = re.search(r'flv_url=(.+?)&', webpage)
2710 self._downloader.report_error(u'unable to extract video url')
2712 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> tag up to the trailing " - XVID..." suffix.
2716 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2718 self._downloader.report_error(u'unable to extract video title')
2720 video_title = mobj.group(1)
2723 # Extract video thumbnail
2724 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2726 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail address.
2728 video_thumbnail = mobj.group(0)
2734 'upload_date': None,
2735 'title': video_title,
2737 'thumbnail': video_thumbnail,
2738 'description': None,
2744 class SoundcloudIE(InfoExtractor):
2745 """Information extractor for soundcloud.com
2746 To access the media, the uid of the song and a stream token
2747 must be extracted from the page source and the script must make
2748 a request to media.soundcloud.com/crossdomain.xml. Then
2749 the media can be grabbed by requesting from an url composed
2750 of the stream token and uid
# NOTE(review): elided listing -- try:/guard lines between the numbered
# statements are not visible; comments describe only the visible code.
2753 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2754 IE_NAME = u'soundcloud'
2756 def __init__(self, downloader=None):
2757 InfoExtractor.__init__(self, downloader)
2759 def report_resolve(self, video_id):
2760 """Report information extraction."""
2761 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2763 def report_extraction(self, video_id):
2764 """Report information extraction."""
2765 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2767 def _real_extract(self, url):
2768 mobj = re.match(self._VALID_URL, url)
2770 self._downloader.report_error(u'invalid URL: %s' % url)
2773 # extract uploader (which is in the url)
2774 uploader = mobj.group(1)
2775 # extract simple title (uploader + slug of song title)
2776 slug_title = mobj.group(2)
2777 simple_title = uploader + u'-' + slug_title
2779 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the public page URL to the track's API record
# (the client_id is this program's registered API key).
2781 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2782 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2783 request = compat_urllib_request.Request(resolv_url)
2785 info_json_bytes = compat_urllib_request.urlopen(request).read()
2786 info_json = info_json_bytes.decode('utf-8')
2787 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2788 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2791 info = json.loads(info_json)
2792 video_id = info['id']
2793 self.report_extraction('%s/%s' % (uploader, slug_title))
# The streams endpoint yields the concrete media URLs for this track id.
2795 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2796 request = compat_urllib_request.Request(streams_url)
2798 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2799 stream_json = stream_json_bytes.decode('utf-8')
2800 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2801 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2804 streams = json.loads(stream_json)
2805 mediaURL = streams['http_mp3_128_url']
2810 'uploader': info['user']['username'],
2811 'upload_date': info['created_at'],
2812 'title': info['title'],
2814 'description': info['description'],
2817 class SoundcloudSetIE(InfoExtractor):
2818 """Information extractor for soundcloud.com sets
2819 To access the media, the uid of the song and a stream token
2820 must be extracted from the page source and the script must make
2821 a request to media.soundcloud.com/crossdomain.xml. Then
2822 the media can be grabbed by requesting from an url composed
2823 of the stream token and uid
2826 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2827 IE_NAME = u'soundcloud'
2829 def __init__(self, downloader=None):
2830 InfoExtractor.__init__(self, downloader)
2832 def report_resolve(self, video_id):
2833 """Report information extraction."""
2834 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2836 def report_extraction(self, video_id):
2837 """Report information extraction."""
2838 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2840 def _real_extract(self, url):
2841 mobj = re.match(self._VALID_URL, url)
2843 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2846 # extract uploader (which is in the url)
2847 uploader = mobj.group(1)
2848 # extract simple title (uploader + slug of song title)
2849 slug_title = mobj.group(2)
2850 simple_title = uploader + u'-' + slug_title
2852 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2854 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2855 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2856 request = compat_urllib_request.Request(resolv_url)
2858 info_json_bytes = compat_urllib_request.urlopen(request).read()
2859 info_json = info_json_bytes.decode('utf-8')
2860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2861 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2865 info = json.loads(info_json)
2866 if 'errors' in info:
2867 for err in info['errors']:
2868 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2871 for track in info['tracks']:
2872 video_id = track['id']
2873 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2875 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2876 request = compat_urllib_request.Request(streams_url)
2878 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2879 stream_json = stream_json_bytes.decode('utf-8')
2880 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2881 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2884 streams = json.loads(stream_json)
2885 mediaURL = streams['http_mp3_128_url']
2890 'uploader': track['user']['username'],
2891 'upload_date': track['created_at'],
2892 'title': track['title'],
2894 'description': track['description'],
2899 class InfoQIE(InfoExtractor):
2900 """Information extractor for infoq.com"""
# NOTE(review): elided listing -- guard lines between the numbered statements
# are not visible; comments describe only the visible code.
2901 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2903 def report_extraction(self, video_id):
2904 """Report information extraction."""
2905 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2907 def _real_extract(self, url):
2908 mobj = re.match(self._VALID_URL, url)
2910 self._downloader.report_error(u'invalid URL: %s' % url)
2913 webpage = self._download_webpage(url, video_id=url)
2914 self.report_extraction(url)
# The real media id is base64-encoded (then URL-quoted) in the page's
# jsclassref attribute; decoding yields the rtmpe stream path.
2917 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2919 self._downloader.report_error(u'unable to extract video url')
2921 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2922 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2925 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2927 self._downloader.report_error(u'unable to extract video title')
2929 video_title = mobj.group(1)
2931 # Extract description
2932 video_description = u'No description available.'
2933 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2934 if mobj is not None:
2935 video_description = mobj.group(1)
# Derive id and extension from the final path component of the stream URL.
2937 video_filename = video_url.split('/')[-1]
2938 video_id, extension = video_filename.split('.')
2944 'upload_date': None,
2945 'title': video_title,
2946 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2948 'description': video_description,
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy (try/return statements, dict delimiters),
# so this class is an incomplete fragment and will not parse as-is.
2953 class MixcloudIE(InfoExtractor):
2954 """Information extractor for www.mixcloud.com"""
2956 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2957 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2958 IE_NAME = u'mixcloud'
2960 def __init__(self, downloader=None):
2961 InfoExtractor.__init__(self, downloader)
2963 def report_download_json(self, file_id):
2964 """Report JSON download."""
2965 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2967 def report_extraction(self, file_id):
2968 """Report information extraction."""
2969 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2971 def get_urls(self, jsonData, fmt, bitrate='best'):
2972 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown bitrate) falls back to the highest available one;
# a TypeError is caught below for formats without bitrate sub-keys.
2975 bitrate_list = jsonData[fmt]
2976 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2977 bitrate = max(bitrate_list) # select highest
2979 url_list = jsonData[fmt][bitrate]
2980 except TypeError: # we have no bitrate info.
2981 url_list = jsonData[fmt]
2984 def check_urls(self, url_list):
2985 """Returns 1st active url from list"""
# Probes each URL with an HTTP request; network errors skip to the next.
2986 for url in url_list:
2988 compat_urllib_request.urlopen(url)
2990 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2995 def _print_formats(self, formats):
# Human-readable dump of the formats dict (used by --list-formats).
2996 print('Available formats:')
2997 for fmt in formats.keys():
2998 for b in formats[fmt]:
3000 ext = formats[fmt][b][0]
3001 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3002 except TypeError: # we have no bitrate info
3003 ext = formats[fmt][0]
3004 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3007 def _real_extract(self, url):
3008 mobj = re.match(self._VALID_URL, url)
3010 self._downloader.report_error(u'invalid URL: %s' % url)
3012 # extract uploader & filename from url
3013 uploader = mobj.group(1).decode('utf-8')
3014 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3016 # construct API request
3017 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3018 # retrieve .json file with links to files
3019 request = compat_urllib_request.Request(file_url)
3021 self.report_download_json(file_url)
3022 jsonData = compat_urllib_request.urlopen(request).read()
3023 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3024 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3028 json_data = json.loads(jsonData)
3029 player_url = json_data['player_swf_url']
3030 formats = dict(json_data['audio_formats'])
3032 req_format = self._downloader.params.get('format', None)
3035 if self._downloader.params.get('listformats', None):
3036 self._print_formats(formats)
# 'best' (or no requested format): try each format until a working URL is
# found; otherwise look up the requested format directly.
3039 if req_format is None or req_format == 'best':
3040 for format_param in formats.keys():
3041 url_list = self.get_urls(formats, format_param)
3043 file_url = self.check_urls(url_list)
3044 if file_url is not None:
3047 if req_format not in formats:
3048 self._downloader.report_error(u'format is not available')
3051 url_list = self.get_urls(formats, req_format)
3052 file_url = self.check_urls(url_list)
3053 format_param = req_format
# NOTE(review): the .decode() calls below imply byte strings (Python 2 era
# code) — confirm before porting.
3056 'id': file_id.decode('utf-8'),
3057 'url': file_url.decode('utf-8'),
3058 'uploader': uploader.decode('utf-8'),
3059 'upload_date': None,
3060 'title': json_data['name'],
3061 'ext': file_url.split('.')[-1].decode('utf-8'),
3062 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3063 'thumbnail': json_data['thumbnail_url'],
3064 'description': json_data['description'],
3065 'player_url': player_url.decode('utf-8'),
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy (try blocks, returns, dict delimiters), so
# this class is an incomplete fragment and will not parse as-is.
3068 class StanfordOpenClassroomIE(InfoExtractor):
3069 """Information extractor for Stanford's Open ClassRoom"""
3071 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3072 IE_NAME = u'stanfordoc'
3074 def report_download_webpage(self, objid):
3075 """Report information extraction."""
3076 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3078 def report_extraction(self, video_id):
3079 """Report information extraction."""
3080 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3082 def _real_extract(self, url):
# Three cases handled in order: a single video, a course page (list of
# video references), and the root page (list of course references).
3083 mobj = re.match(self._VALID_URL, url)
3085 raise ExtractorError(u'Invalid URL: %s' % url)
3087 if mobj.group('course') and mobj.group('video'): # A specific video
3088 course = mobj.group('course')
3089 video = mobj.group('video')
3091 'id': course + '_' + video,
3093 'upload_date': None,
3096 self.report_extraction(info['id'])
3097 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3098 xmlUrl = baseUrl + video + '.xml'
3100 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3101 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3102 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3104 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3106 info['title'] = mdoc.findall('./title')[0].text
3107 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3109 self._downloader.report_error(u'Invalid metadata XML file')
3111 info['ext'] = info['url'].rpartition('.')[2]
3113 elif mobj.group('course'): # A course page
3114 course = mobj.group('course')
3119 'upload_date': None,
3122 coursepage = self._download_webpage(url, info['id'],
3123 note='Downloading course info page',
3124 errnote='Unable to download course info page')
3126 m = re.search('<h1>([^<]+)</h1>', coursepage)
3128 info['title'] = unescapeHTML(m.group(1))
3130 info['title'] = info['id']
3132 m = re.search('<description>([^<]+)</description>', coursepage)
3134 info['description'] = unescapeHTML(m.group(1))
# Course/root pages collect 'reference' entries and recurse via
# self.extract() on each referenced URL.
3136 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3139 'type': 'reference',
3140 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3144 for entry in info['list']:
3145 assert entry['type'] == 'reference'
3146 results += self.extract(entry['url'])
3150 'id': 'Stanford OpenClassroom',
3153 'upload_date': None,
3156 self.report_download_webpage(info['id'])
3157 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3159 rootpage = compat_urllib_request.urlopen(rootURL).read()
3160 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3161 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3164 info['title'] = info['id']
3166 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3169 'type': 'reference',
3170 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3175 for entry in info['list']:
3176 assert entry['type'] == 'reference'
3177 results += self.extract(entry['url'])
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3180 class MTVIE(InfoExtractor):
3181 """Information extractor for MTV.com"""
3183 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3186 def report_extraction(self, video_id):
3187 """Report information extraction."""
3188 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3190 def _real_extract(self, url):
3191 mobj = re.match(self._VALID_URL, url)
3193 self._downloader.report_error(u'invalid URL: %s' % url)
3195 if not mobj.group('proto'):
3196 url = 'http://' + url
3197 video_id = mobj.group('videoid')
3199 webpage = self._download_webpage(url, video_id)
# Song name, performer, playlist URI and content id are scraped from
# <meta> tags / inline JS on the page.
3201 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3203 self._downloader.report_error(u'unable to extract song name')
3205 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3206 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3208 self._downloader.report_error(u'unable to extract performer')
3210 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3211 video_title = performer + ' - ' + song_name
3213 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3215 self._downloader.report_error(u'unable to mtvn_uri')
3217 mtvn_uri = mobj.group(1)
3219 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3221 self._downloader.report_error(u'unable to extract content id')
3223 content_id = mobj.group(1)
3225 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3226 self.report_extraction(video_id)
3227 request = compat_urllib_request.Request(videogen_url)
3229 metadataXml = compat_urllib_request.urlopen(request).read()
3230 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3231 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3234 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3235 renditions = mdoc.findall('.//rendition')
3237 # For now, always pick the highest quality.
3238 rendition = renditions[-1]
3241 _,_,ext = rendition.attrib['type'].partition('/')
3242 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3243 video_url = rendition.find('./src').text
3245 self._downloader.trouble('Invalid rendition field.')
3251 'uploader': performer,
3252 'upload_date': None,
3253 'title': video_title,
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy (e.g. the _gen_sid def line, returns), so
# this class is an incomplete fragment and will not parse as-is.
3261 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com.
3262 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3264 def report_download_webpage(self, file_id):
3265 """Report webpage download."""
3266 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3268 def report_extraction(self, file_id):
3269 """Report information extraction."""
3270 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: current ms timestamp plus two random numbers (presumably
# belongs to a _gen_sid method whose def line is missing from this copy).
3273 nowTime = int(time.time() * 1000)
3274 random1 = random.randint(1000,1998)
3275 random2 = random.randint(1000,9999)
3277 return "%d%d%d" %(nowTime,random1,random2)
3279 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the character source, driven by
# the server-provided seed (linear congruential step mod 65536).
3281 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3283 for i in range(len(source)):
3284 seed = (seed * 211 + 30031 ) % 65536
3285 index = math.floor(seed / 65536 * len(source) )
3286 mixed.append(source[int(index)])
3287 source.remove(source[int(index)])
3288 #return ''.join(mixed)
3291 def _get_file_id(self, fileId, seed):
# Decode the obfuscated '*'-separated file id using the mix string.
3292 mixed = self._get_file_ID_mix_string(seed)
3293 ids = fileId.split('*')
3297 realId.append(mixed[int(ch)])
3298 return ''.join(realId)
3300 def _real_extract(self, url):
3301 mobj = re.match(self._VALID_URL, url)
3303 self._downloader.report_error(u'invalid URL: %s' % url)
3305 video_id = mobj.group('ID')
3307 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3309 request = compat_urllib_request.Request(info_url, None, std_headers)
3311 self.report_download_webpage(video_id)
3312 jsondata = compat_urllib_request.urlopen(request).read()
3313 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3314 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3317 self.report_extraction(video_id)
3319 jsonstr = jsondata.decode('utf-8')
3320 config = json.loads(jsonstr)
3322 video_title = config['data'][0]['title']
3323 seed = config['data'][0]['seed']
3325 format = self._downloader.params.get('format', None)
3326 supported_format = list(config['data'][0]['streamfileids'].keys())
3328 if format is None or format == 'best':
3329 if 'hd2' in supported_format:
3334 elif format == 'worst':
3342 fileid = config['data'][0]['streamfileids'][format]
3343 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3344 except (UnicodeDecodeError, ValueError, KeyError):
3345 self._downloader.report_error(u'unable to extract info section')
3349 sid = self._gen_sid()
3350 fileid = self._get_file_id(fileid, seed)
3352 #column 8,9 of fileid represent the segment number
3353 #fileid[7:9] should be changed
# One download URL (and info dict) is produced per segment key.
3354 for index, key in enumerate(keys):
3356 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3357 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3360 'id': '%s_part%02d' % (video_id, index),
3361 'url': download_url,
3363 'upload_date': None,
3364 'title': video_title,
3367 files_info.append(info)
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3372 class XNXXIE(InfoExtractor):
3373 """Information extractor for xnxx.com"""
3375 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flash video URL, page title, thumbnail.
3377 VIDEO_URL_RE = r'flv_url=(.*?)&'
3378 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3379 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3381 def report_webpage(self, video_id):
3382 """Report information extraction"""
3383 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3385 def report_extraction(self, video_id):
3386 """Report information extraction"""
3387 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3389 def _real_extract(self, url):
3390 mobj = re.match(self._VALID_URL, url)
3392 self._downloader.report_error(u'invalid URL: %s' % url)
3394 video_id = mobj.group(1)
3396 self.report_webpage(video_id)
3398 # Get webpage content
3400 webpage_bytes = compat_urllib_request.urlopen(url).read()
3401 webpage = webpage_bytes.decode('utf-8')
3402 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3403 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3406 result = re.search(self.VIDEO_URL_RE, webpage)
3408 self._downloader.report_error(u'unable to extract video url')
3410 video_url = compat_urllib_parse.unquote(result.group(1))
3412 result = re.search(self.VIDEO_TITLE_RE, webpage)
3414 self._downloader.report_error(u'unable to extract video title')
3416 video_title = result.group(1)
3418 result = re.search(self.VIDEO_THUMB_RE, webpage)
3420 self._downloader.report_error(u'unable to extract video thumbnail')
3422 video_thumbnail = result.group(1)
3428 'upload_date': None,
3429 'title': video_title,
3431 'thumbnail': video_thumbnail,
3432 'description': None,
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy (if/else guards, try lines, returns), so
# this class is an incomplete fragment and will not parse as-is.
3436 class GooglePlusIE(InfoExtractor):
3437 """Information extractor for plus.google.com."""
3439 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3440 IE_NAME = u'plus.google'
3442 def __init__(self, downloader=None):
3443 InfoExtractor.__init__(self, downloader)
3445 def report_extract_entry(self, url):
3446 """Report downloading extry"""
3447 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3449 def report_date(self, upload_date):
3450 """Report downloading extry"""
3451 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3453 def report_uploader(self, uploader):
3454 """Report downloading extry"""
3455 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3457 def report_title(self, video_title):
3458 """Report downloading extry"""
3459 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3461 def report_extract_vid_page(self, video_page):
3462 """Report information extraction."""
3463 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3465 def _real_extract(self, url):
3466 # Extract id from URL
3467 mobj = re.match(self._VALID_URL, url)
3469 self._downloader.report_error(u'Invalid URL: %s' % url)
3472 post_url = mobj.group(0)
3473 video_id = mobj.group(1)
3475 video_extension = 'flv'
3477 # Step 1, Retrieve post webpage to extract further information
3478 self.report_extract_entry(post_url)
3479 request = compat_urllib_request.Request(post_url)
3481 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3482 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3483 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3486 # Extract update date
3488 pattern = 'title="Timestamp">(.*?)</a>'
3489 mobj = re.search(pattern, webpage)
3491 upload_date = mobj.group(1)
3492 # Convert timestring to a format suitable for filename
3493 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3494 upload_date = upload_date.strftime('%Y%m%d')
3495 self.report_date(upload_date)
# Extract uploader (author link on the post page).
3499 pattern = r'rel\="author".*?>(.*?)</a>'
3500 mobj = re.search(pattern, webpage)
3502 uploader = mobj.group(1)
3503 self.report_uploader(uploader)
3506 # Get the first line for title
3508 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3509 mobj = re.search(pattern, webpage)
3511 video_title = mobj.group(1)
3512 self.report_title(video_title)
3514 # Step 2, Stimulate clicking the image box to launch video
3515 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3516 mobj = re.search(pattern, webpage)
3518 self._downloader.report_error(u'unable to extract video page URL')
3520 video_page = mobj.group(1)
3521 request = compat_urllib_request.Request(video_page)
3523 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3524 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3525 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3527 self.report_extract_vid_page(video_page)
3530 # Extract video links on video page
3531 """Extract video links of all sizes"""
3532 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3533 mobj = re.findall(pattern, webpage)
3535 self._downloader.report_error(u'unable to extract video links')
3537 # Sort in resolution
3538 links = sorted(mobj)
3540 # Choose the lowest of the sort, i.e. highest resolution
3541 video_url = links[-1]
3542 # Only get the url. The resolution part in the tuple has no use anymore
3543 video_url = video_url[-1]
3544 # Treat escaped \u0026 style hex
3546 video_url = video_url.decode("unicode_escape")
3547 except AttributeError: # Python 3
3548 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3554 'uploader': uploader,
3555 'upload_date': upload_date,
3556 'title': video_title,
3557 'ext': video_extension,
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3560 class NBAIE(InfoExtractor):
# Information extractor for nba.com videos.
3561 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3564 def _real_extract(self, url):
3565 mobj = re.match(self._VALID_URL, url)
3567 self._downloader.report_error(u'invalid URL: %s' % url)
3570 video_id = mobj.group(1)
3571 if video_id.endswith('/index.html'):
3572 video_id = video_id[:-len('/index.html')]
3574 webpage = self._download_webpage(url, video_id)
# Media URL is built from a fixed CDN pattern, not scraped from the page.
3576 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3577 def _findProp(rexp, default=None):
# Helper: first regex group from the page, unescaped, or `default`.
3578 m = re.search(rexp, webpage)
3580 return unescapeHTML(m.group(1))
3584 shortened_video_id = video_id.rpartition('/')[2]
3585 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3587 'id': shortened_video_id,
# NOTE(review): key below is 'uploader_date' — possibly a typo for
# 'upload_date' (the documented optional field); confirm before changing.
3591 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3592 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3596 class JustinTVIE(InfoExtractor):
3597 """Information extractor for justin.tv and twitch.tv"""
3598 # TODO: One broadcast may be split into multiple videos. The key
3599 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3600 # starts at 1 and increases. Can we treat all parts as one video?
3602 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3603 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3604 _JUSTIN_PAGE_LIMIT = 100
3605 IE_NAME = u'justin.tv'
3607 def report_extraction(self, file_id):
3608 """Report information extraction."""
3609 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3611 def report_download_page(self, channel, offset):
3612 """Report attempt to download a single page of videos."""
3613 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3614 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3616 # Return count of items, list of *valid* items
3617 def _parse_page(self, url):
3619 urlh = compat_urllib_request.urlopen(url)
3620 webpage_bytes = urlh.read()
3621 webpage = webpage_bytes.decode('utf-8', 'ignore')
3622 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3623 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# A non-list response is an API error payload with an 'error' key.
3626 response = json.loads(webpage)
3627 if type(response) != list:
3628 error_text = response.get('error', 'unknown error')
3629 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3632 for clip in response:
3633 video_url = clip['video_file_url']
3635 video_extension = os.path.splitext(video_url)[1][1:]
3636 video_date = re.sub('-', '', clip['start_time'][:10])
3637 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3638 video_id = clip['id']
3639 video_title = clip.get('title', video_id)
3643 'title': video_title,
3644 'uploader': clip.get('channel_name', video_uploader_id),
3645 'uploader_id': video_uploader_id,
3646 'upload_date': video_date,
3647 'ext': video_extension,
3649 return (len(response), info)
3651 def _real_extract(self, url):
3652 mobj = re.match(self._VALID_URL, url)
3654 self._downloader.report_error(u'invalid URL: %s' % url)
# Channel URLs (one regex group) use the paged archives endpoint;
# single-broadcast URLs use by_archive.
3657 api = 'http://api.justin.tv'
3658 video_id = mobj.group(mobj.lastindex)
3660 if mobj.lastindex == 1:
3662 api += '/channel/archives/%s.json'
3664 api += '/broadcast/by_archive/%s.json'
3665 api = api % (video_id,)
3667 self.report_extraction(video_id)
3671 limit = self._JUSTIN_PAGE_LIMIT
3674 self.report_download_page(video_id, offset)
3675 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3676 page_count, page_info = self._parse_page(page_url)
3677 info.extend(page_info)
# A short page signals the last page of results.
3678 if not paged or page_count != limit:
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3683 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com videos.
3684 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3686 def _real_extract(self, url):
3687 mobj = re.match(self._VALID_URL, url)
3689 self._downloader.report_error(u'invalid URL: %s' % url)
3692 video_id = mobj.group('id')
3693 webpage = self._download_webpage(url, video_id)
3695 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3697 self._downloader.report_error(u'unable to find video information')
3698 video_url = unescapeHTML(m.group('url'))
3700 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3702 self._downloader.trouble(u'Cannot find video title')
3703 title = clean_html(m.group('title'))
3705 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3707 desc = unescapeHTML(m.group('desc'))
3716 'description': desc,
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3720 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com game/video pages.
3721 _VALID_URL = r"""http://store.steampowered.com/
3722 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3724 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3728 def suitable(cls, url):
3729 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with re.VERBOSE.
3730 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3732 def _real_extract(self, url):
3733 m = re.match(self._VALID_URL, url, re.VERBOSE)
3734 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3735 gameID = m.group('gameID')
3736 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3737 webpage = self._download_webpage(videourl, gameID)
# Movies, titles and thumbnails are matched in parallel and zipped
# positionally — assumes the three patterns appear in the same order.
3738 mweb = re.finditer(urlRE, webpage)
3739 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3740 titles = re.finditer(namesRE, webpage)
3741 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3742 thumbs = re.finditer(thumbsRE, webpage)
3744 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3745 video_id = vid.group('videoID')
3746 title = vtitle.group('videoName')
3747 video_url = vid.group('videoURL')
3748 video_thumb = thumb.group('thumbnail')
3750 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3755 'title': unescapeHTML(title),
3756 'thumbnail': video_thumb
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3761 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos.
3762 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3763 IE_NAME = u'ustream'
3765 def _real_extract(self, url):
3766 m = re.match(self._VALID_URL, url)
3767 video_id = m.group('videoID')
# Media URL is built from a fixed CDN pattern; title and uploader are
# scraped from the page.
3768 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3769 webpage = self._download_webpage(url, video_id)
3770 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3771 title = m.group('title')
3772 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3773 uploader = m.group('uploader')
3779 'uploader': uploader
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3783 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
3784 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3785 IE_NAME = u'WorldStarHipHop'
3787 def _real_extract(self, url):
3788 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3790 webpage_src = compat_urllib_request.urlopen(url).read()
3791 webpage_src = webpage_src.decode('utf-8')
3793 mobj = re.search(_src_url, webpage_src)
3795 m = re.match(self._VALID_URL, url)
3796 video_id = m.group('id')
3798 if mobj is not None:
3799 video_url = mobj.group()
3800 if 'mp4' in video_url:
3805 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3808 _title = r"""<title>(.*)</title>"""
3810 mobj = re.search(_title, webpage_src)
3812 if mobj is not None:
3813 title = mobj.group(1)
# Fallback title when the page has no <title> tag.
3815 title = 'World Start Hip Hop - %s' % time.ctime()
3817 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3818 mobj = re.search(_thumbnail, webpage_src)
3820 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3821 if mobj is not None:
3822 thumbnail = mobj.group(1)
3824 _title = r"""candytitles.*>(.*)</span>"""
3825 mobj = re.search(_title, webpage_src)
3826 if mobj is not None:
3827 title = mobj.group(1)
3834 'thumbnail' : thumbnail,
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3839 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows.
3840 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3842 def _real_extract(self, url):
3843 m = re.match(self._VALID_URL, url)
3844 video_id = m.group('videoID')
3846 webpage = self._download_webpage(url, video_id)
# Show metadata is embedded as JSON in an inline `gon.show=` script tag.
3847 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3849 raise ExtractorError(u'Cannot find metadata')
3850 json_data = m.group(1)
3853 data = json.loads(json_data)
3854 except ValueError as e:
3855 raise ExtractorError(u'Invalid JSON: ' + str(e))
3857 video_url = data['akamai_url'] + '&cbr=256'
3858 url_parts = compat_urllib_parse_urlparse(video_url)
3859 video_ext = url_parts.path.rpartition('.')[2]
3864 'title': data['title'],
3865 'description': data.get('teaser_text'),
3866 'location': data.get('country_of_origin'),
3867 'uploader': data.get('host', {}).get('name'),
3868 'uploader_id': data.get('host', {}).get('slug'),
3869 'thumbnail': data.get('image', {}).get('large_url_2x'),
3870 'duration': data.get('duration'),
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy (loop headers, returns, dict delimiters),
# so this class is an incomplete fragment and will not parse as-is.
3875 class YouPornIE(InfoExtractor):
3876 """Information extractor for youporn.com."""
3877 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3879 def _print_formats(self, formats):
3880 """Print all available formats"""
3881 print(u'Available formats:')
3882 print(u'ext\t\tformat')
3883 print(u'---------------------------------')
3884 for format in formats:
3885 print(u'%s\t\t%s' % (format['ext'], format['format']))
3887 def _specific(self, req_format, formats):
# Returns the format dict matching the requested format string
# (presumably a loop over `formats`; its header line is missing here).
3889 if(x["format"]==req_format):
3893 def _real_extract(self, url):
3894 mobj = re.match(self._VALID_URL, url)
3896 self._downloader.report_error(u'invalid URL: %s' % url)
3899 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie before fetching the page.
3901 req = compat_urllib_request.Request(url)
3902 req.add_header('Cookie', 'age_verified=1')
3903 webpage = self._download_webpage(req, video_id)
3905 # Get the video title
3906 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3908 raise ExtractorError(u'Unable to extract video title')
3909 video_title = result.group('title').strip()
3911 # Get the video date
3912 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3914 self._downloader.report_warning(u'unable to extract video date')
3917 upload_date = result.group('date').strip()
3919 # Get the video uploader
3920 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3922 self._downloader.report_warning(u'unable to extract uploader')
3923 video_uploader = None
3925 video_uploader = result.group('uploader').strip()
3926 video_uploader = clean_html( video_uploader )
3928 # Get all of the formats available
3929 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3930 result = re.search(DOWNLOAD_LIST_RE, webpage)
3932 raise ExtractorError(u'Unable to extract download list')
3933 download_list_html = result.group('download_list').strip()
3935 # Get all of the links from the page
3936 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3937 links = re.findall(LINK_RE, download_list_html)
3938 if(len(links) == 0):
3939 raise ExtractorError(u'ERROR: no known formats available for video')
3941 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3946 # A link looks like this:
3947 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3948 # A path looks like this:
3949 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3950 video_url = unescapeHTML( link )
3951 path = compat_urllib_parse_urlparse( video_url ).path
3952 extension = os.path.splitext( path )[1][1:]
3953 format = path.split('/')[4].split('_')[:2]
3956 format = "-".join( format )
3957 title = u'%s-%s-%s' % (video_title, size, bitrate)
3962 'uploader': video_uploader,
3963 'upload_date': upload_date,
3968 'description': None,
3972 if self._downloader.params.get('listformats', None):
3973 self._print_formats(formats)
# Format selection: 'best' first, 'worst' last, 'all'/-1 everything,
# otherwise a specific format looked up via _specific().
3976 req_format = self._downloader.params.get('format', None)
3977 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3979 if req_format is None or req_format == 'best':
3981 elif req_format == 'worst':
3982 return [formats[-1]]
3983 elif req_format in ('-1', 'all'):
3986 format = self._specific( req_format, formats )
3988 self._downloader.report_error(u'requested format not available')
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
3994 class PornotubeIE(InfoExtractor):
3995 """Information extractor for pornotube.com."""
3996 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3998 def _real_extract(self, url):
3999 mobj = re.match(self._VALID_URL, url)
4001 self._downloader.report_error(u'invalid URL: %s' % url)
4004 video_id = mobj.group('videoid')
4005 video_title = mobj.group('title')
4007 # Get webpage content
4008 webpage = self._download_webpage(url, video_id)
4011 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4012 result = re.search(VIDEO_URL_RE, webpage)
4014 self._downloader.report_error(u'unable to extract video url')
4016 video_url = compat_urllib_parse.unquote(result.group('url'))
4018 #Get the uploaded date
4019 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4020 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says "title" but the pattern extracts
# the upload date — looks like a copy/paste slip; confirm before changing.
4022 self._downloader.report_error(u'unable to extract video title')
4024 upload_date = result.group('date')
4026 info = {'id': video_id,
4029 'upload_date': upload_date,
4030 'title': video_title,
# NOTE(review): numeric prefixes are residual line numbers; gaps in them mean
# lines are missing from this copy, so this class is an incomplete fragment
# and will not parse as-is.
4036 class YouJizzIE(InfoExtractor):
4037 """Information extractor for youjizz.com."""
4038 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4040 def _real_extract(self, url):
4041 mobj = re.match(self._VALID_URL, url)
4043 self._downloader.report_error(u'invalid URL: %s' % url)
4046 video_id = mobj.group('videoid')
4048 # Get webpage content
4049 webpage = self._download_webpage(url, video_id)
4051 # Get the video title
4052 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4054 raise ExtractorError(u'ERROR: unable to extract video title')
4055 video_title = result.group('title').strip()
4057 # Get the embed page
4058 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4060 raise ExtractorError(u'ERROR: unable to extract embed page')
4062 embed_page_url = result.group(0).strip()
4063 video_id = result.group('videoid')
# The actual media URL lives on the embed page, in a flash player
# addVariable("file", ...) call.
4065 webpage = self._download_webpage(embed_page_url, video_id)
4068 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4070 raise ExtractorError(u'ERROR: unable to extract video url')
4071 video_url = result.group('source')
4073 info = {'id': video_id,
4075 'title': video_title,
4078 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes. Reads the mix metadata embedded as JSON in
# the playlist page (PAGE.mix = {...};), then walks the play/next API one
# track at a time until the API reports the last track.
# NOTE(review): this excerpt is elided — the line assigning mix_id
# (presumably mix_id = data['id'] or similar), the result-list initialization
# the per-track dict is appended to, the 'break' after the at_last_track
# check, and the final return are not shown here.
4082 class EightTracksIE(InfoExtractor):
4084 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4086 def _real_extract(self, url):
4087 mobj = re.match(self._VALID_URL, url)
4089 raise ExtractorError(u'Invalid URL: %s' % url)
4090 playlist_id = mobj.group('id')
4092 webpage = self._download_webpage(url, playlist_id)
# The mix object is inlined into the page as a JS assignment; DOTALL lets the
# lazy group span newlines up to the first ';' followed by a newline.
4094 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4096 raise ExtractorError(u'Cannot find trax information')
4097 json_like = m.group(1)
4098 data = json.loads(json_like)
# The play API requires a per-listener session token; a random int suffices.
4100 session = str(random.randint(0, 1000000000))
4102 track_count = data['tracks_count']
4103 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4104 next_url = first_url
# One API round-trip per track; i only feeds the progress note.
4106 for i in itertools.count():
4107 api_json = self._download_webpage(next_url, playlist_id,
4108 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4109 errnote=u'Failed to download song information')
4110 api_data = json.loads(api_json)
4111 track_data = api_data[u'set']['track']
4113 'id': track_data['id'],
4114 'url': track_data['track_file_stream_url'],
4115 'title': track_data['performer'] + u' - ' + track_data['name'],
4116 'raw_title': track_data['name'],
4117 'uploader_id': data['user']['login'],
# The API flags the final track; otherwise ask for the next one.
4121 if api_data['set']['at_last_track']:
4123 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com. Media and thumbnail URLs are built directly from
# the video id against the CDN; only title and uploader are scraped.
# NOTE(review): this excerpt is elided — the final info dict opener and its
# 'url'/'ext' entries are not shown here.
4126 class KeekIE(InfoExtractor):
4127 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4130 def _real_extract(self, url):
4131 m = re.match(self._VALID_URL, url)
4132 video_id = m.group('videoID')
# CDN locations are deterministic given the id — no scraping needed for them.
4133 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4134 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4135 webpage = self._download_webpage(url, video_id)
# Both searches below call .group() without a None check, so a page-layout
# change would surface as an AttributeError rather than a report_error.
4136 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4137 title = unescapeHTML(m.group('title'))
4138 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4139 uploader = clean_html(m.group('uploader'))
4145 'thumbnail': thumbnail,
4146 'uploader': uploader
# Extractor for ted.com talks and playlists. _VALID_URL is a verbose-mode
# pattern distinguishing /playlists/<id> from /talks/<name> URLs, so
# suitable() is overridden to pass re.VERBOSE.
# NOTE(review): this excerpt is elided — the 'else' branch introducing the
# playlist path, the video_RE raw-string opener, the info-list initialization
# and returns in _playlist_videos_info, and the final info dict of
# _talk_info are not shown here.
4150 class TEDIE(InfoExtractor):
4151 _VALID_URL=r'''http://www.ted.com/
4153 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4155 ((?P<type_talk>talks)) # We have a simple talk
4157 /(?P<name>\w+) # Here goes the name and then ".html"
# Override: the base class matches without re.VERBOSE, which would break
# this multi-line pattern.
4161 def suitable(cls, url):
4162 """Receives a URL and returns True if suitable for this IE."""
4163 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4165 def _real_extract(self, url):
4166 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Single talk: one-element result list. Otherwise treat as a playlist.
4167 if m.group('type_talk'):
4168 return [self._talk_info(url)]
4170 playlist_id=m.group('playlist_id')
4171 name=m.group('name')
4172 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4173 return self._playlist_videos_info(url,name,playlist_id)
4175 def _talk_video_link(self,mediaSlug):
4176 '''Returns the video link for that mediaSlug'''
4177 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4179 def _playlist_videos_info(self,url,name,playlist_id=0):
4180 '''Returns the videos of the playlist'''
# NOTE(review): '([.\s]*?)' is a character class matching only literal dots
# and whitespace between attributes — presumably '.*?' with DOTALL was
# intended; it works only because the markup has nothing else in between.
4182 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4183 ([.\s]*?)data-playlist_item_id="(\d+)"
4184 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4186 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4187 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4188 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4189 m_names=re.finditer(video_name_RE,webpage)
# Pair each talk's data attributes with its title link positionally; zip
# silently truncates if the two scans find different counts.
4191 for m_video, m_name in zip(m_videos,m_names):
4192 video_id=m_video.group('video_id')
4193 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4194 info.append(self._talk_info(talk_url,video_id))
4197 def _talk_info(self, url, video_id=0):
4198 """Return the video for the talk in the url"""
4199 m=re.match(self._VALID_URL, url,re.VERBOSE)
4200 videoName=m.group('name')
4201 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4202 # If the url includes the language we get the title translated
4203 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4204 title=re.search(title_RE, webpage).group('title')
# talkDetails is a JS object inlined in the page; id and mediaSlug are
# pulled out of it with a verbose regex rather than a JSON parse.
4205 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4206 "id":(?P<videoID>[\d]+).*?
4207 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4208 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4209 thumb_match=re.search(thumb_RE,webpage)
4210 info_match=re.search(info_RE,webpage,re.VERBOSE)
# The scraped numeric id overrides the caller-supplied video_id.
4211 video_id=info_match.group('videoID')
4212 mediaSlug=info_match.group('mediaSlug')
4213 video_url=self._talk_video_link(mediaSlug)
4219 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de. Derives the video id from the URL path, then
# reads everything else from the site's XML metadata endpoint.
# NOTE(review): this excerpt is elided — the condition guarding the
# second-to-last path-element fallback, the returns after each report_error,
# the else-branches assigning defaults for format/description/thumbnail, and
# the opener of the final info dict are not shown here.
4223 class MySpassIE(InfoExtractor):
4224 _VALID_URL = r'http://www.myspass.de/.*'
4226 def _real_extract(self, url):
4227 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4229 # video id is the last path element of the URL
4230 # usually there is a trailing slash, so also try the second but last
4231 url_path = compat_urllib_parse_urlparse(url).path
4232 url_parent_path, video_id = os.path.split(url_path)
4234 _, video_id = os.path.split(url_parent_path)
4237 metadata_url = META_DATA_URL_TEMPLATE % video_id
4238 metadata_text = self._download_webpage(metadata_url, video_id)
# Encode back to bytes before parsing so ElementTree honors the XML
# declaration's encoding.
4239 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4241 # extract values from metadata
# url_flv and title are mandatory; their absence is reported as an error.
4242 url_flv_el = metadata.find('url_flv')
4243 if url_flv_el is None:
4244 self._downloader.report_error(u'unable to extract download url')
4246 video_url = url_flv_el.text
# File extension is taken from the download URL, without the leading dot.
4247 extension = os.path.splitext(video_url)[1][1:]
4248 title_el = metadata.find('title')
4249 if title_el is None:
4250 self._downloader.report_error(u'unable to extract title')
4252 title = title_el.text
# format_id, description and imagePreview are optional metadata fields.
4253 format_id_el = metadata.find('format_id')
4254 if format_id_el is None:
4257 format = format_id_el.text
4258 description_el = metadata.find('description')
4259 if description_el is not None:
4260 description = description_el.text
4263 imagePreview_el = metadata.find('imagePreview')
4264 if imagePreview_el is not None:
4265 thumbnail = imagePreview_el.text
4274 'thumbnail': thumbnail,
4275 'description': description
# Extractor for spiegel.de videos. Title comes from the HTML page; the media
# location comes from a per-video XML descriptor on video2.spiegel.de.
# NOTE(review): this excerpt is elided — the 'm is None' guard before the
# title error and the opener/remaining keys of the final info dict are not
# shown here.
4279 class SpiegelIE(InfoExtractor):
4280 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4282 def _real_extract(self, url):
4283 m = re.match(self._VALID_URL, url)
4284 video_id = m.group('videoID')
4286 webpage = self._download_webpage(url, video_id)
4287 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4289 raise ExtractorError(u'Cannot find title')
4290 video_title = unescapeHTML(m.group(1))
# The XML descriptor lists one child element per available format.
4292 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4293 xml_code = self._download_webpage(xml_url, video_id,
4294 note=u'Downloading XML', errnote=u'Failed to download XML')
4296 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1] selects the last listed variant — presumably the highest
# quality; verify against the feed's ordering.
4297 last_type = idoc[-1]
4298 filename = last_type.findall('./filename')[0].text
4299 duration = float(last_type.findall('./duration')[0].text)
4301 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension is whatever follows the last dot of the server-side filename.
4302 video_ext = filename.rpartition('.')[2]
4307 'title': video_title,
4308 'duration': duration,
4312 class LiveLeakIE(InfoExtractor):
4314 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4315 IE_NAME = u'liveleak'
4317 def _real_extract(self, url):
4318 mobj = re.match(self._VALID_URL, url)
4320 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4323 video_id = mobj.group('video_id')
4325 webpage = self._download_webpage(url, video_id)
4327 m = re.search(r'file: "(.*?)",', webpage)
4329 self._downloader.report_error(u'unable to find video url')
4331 video_url = m.group(1)
4333 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4335 self._downloader.trouble(u'Cannot find video title')
4336 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4338 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4340 desc = unescapeHTML(m.group('desc'))
4344 m = re.search(r'By:.*?(\w+)</a>', webpage)
4346 uploader = clean_html(m.group(1))
4355 'description': desc,
4356 'uploader': uploader
4362 def gen_extractors():
4363 """ Return a list of an instance of every supported extractor.
4364 The order does matter; the first extractor matched is the one handling the URL.
4367 YoutubePlaylistIE(),
4392 StanfordOpenClassroomIE(),
4402 WorldStarHipHopIE(),