2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this excerpt is missing several original source lines
    # (decorators, 'try:'/'if' statements, and some 'def' headers). All
    # surviving code lines are kept verbatim; the indentation of orphaned
    # lines below is approximate.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): presumably decorated with @classmethod (elided here)
        return re.match(cls._VALID_URL, url) is not None

        # NOTE(review): the 'def' headers for the next two members are
        # missing from this excerpt; only their docstrings/bodies survive.
        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): orphaned body of the elided IE_NAME accessor;
        # strips the trailing 'IE' from the class name.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
            # Runs when the caller passed no progress note (guard elided).
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = u'Unable to download webpage'
            # Re-raise with the original traceback attached for debugging.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction going even on mis-encoded pages.
        return webpage_bytes.decode(encoding, 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt is missing some original lines — the
    # opening of the _VALID_URL raw string, several try/if/else statements,
    # and the interiors of some literals. Surviving code lines are kept
    # verbatim; indentation of orphaned lines is approximate.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?: # the various things that can precede the ID:
        (?:(?:v|embed|e)/) # v/ or embed/ or e/
        |(?: # or the v= param in all its forms
        (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
        (?:\?|\#!?) # the params delimiter ? or # or #!
        (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
        )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension map (interior of the literal elided in this view)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-size map (interior of the literal elided in this view)
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; hand them to
        # YoutubePlaylistIE instead.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language + format)."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Query the timedtext list endpoint.

        Returns a dict mapping lang_code -> track name on success, or a
        (warning message, None) tuple on failure / no subtitles.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # (name, lang_code) pairs -> {lang_code: name}
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download one subtitle track.

        Returns (None, sub_lang, sub) on success or a
        (warning message, None) tuple on failure.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        # Interior of the query-parameter dict elided in this view.
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """Download the single subtitle track selected by the user options."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        # Language preference: explicit --sub-lang first, then English,
        # then (in the elided else branch) the first reported language.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)

    def _print_formats(self, formats):
        """Print one line per itag: 'itag : ext [WxH]'."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language, optionally log in, and confirm age."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force the English-language site so later regexes match.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the anti-forgery tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Interior of the login-form dict (several fields elided in this view).
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means auth failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age (interior of the form dict partially elided).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the video ID embedded in a YouTube URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.report_error(u'invalid URL: %s' % url)
        # group(2) is the ([0-9A-Za-z_-]+) ID group of _VALID_URL
        video_id = mobj.group(2)

    def _real_extract(self, url):
        """Extract the info dict(s) for one YouTube video URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS-escaped slashes in the SWF URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' values until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the page, then normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries 'sig';
            # a missing signature would raise KeyError here — TODO confirm.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # Interior of the results dict literal (opening elided in this view).
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this excerpt omits several original lines ('try:'/'if'
    # guards and the openings of some literals). Surviving code lines are
    # kept verbatim; indentation of orphaned lines is approximate.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (interior of the form dict partially elided).
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the info dict for one Metacafe video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate 'yt-<id>' videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback path: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.report_error(u'unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.report_error(u'unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Interior of the returned info dict (opening elided in this view).
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this excerpt omits several original lines ('if mobj is
    # None:' guards, try statements, literal openings). Surviving code lines
    # are kept verbatim; indentation of orphaned lines is approximate.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dict for one Dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Strip the slug/query suffix off the path component.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Opt out of the family filter so age-restricted pages load.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present in flashvars (checked in order).
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for the official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Interior of the returned info dict (opening elided in this view).
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this excerpt omits several original lines ('try:'/'if'
    # guards and literal openings). Surviving code lines are kept verbatim;
    # indentation of orphaned lines is approximate.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dict for one Photobucket video URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader share one <title> regex: group(1)=title, group(2)=uploader.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Interior of the returned info dict (opening elided in this view).
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    # Both capture two numeric IDs: group(1) and group(2).
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
921 def __init__(self, downloader=None):
922 InfoExtractor.__init__(self, downloader)
924 def report_download_webpage(self, video_id):
925 """Report webpage download."""
926 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
928 def report_extraction(self, video_id):
929 """Report information extraction."""
930 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# Extract metadata and the media URL for a Yahoo Video watch page.
# NOTE(review): this extract is missing source lines (the embedded line
# numbers skip, e.g. 934->936, 945->947) — the `if mobj is None:` guards,
# `try:` statements, and `return` lines are absent here. Confirm every
# control-flow path against the upstream file before editing.
932 def _real_extract(self, url, new_video=True):
933 # Extract ID from URL
934 mobj = re.match(self._VALID_URL, url)
# Error path: URL did not match _VALID_URL (guard line missing above).
936 self._downloader.report_error(u'Invalid URL: %s' % url)
# Second regex group is the numeric video id; Yahoo serves FLV.
939 video_id = mobj.group(2)
940 video_extension = 'flv'
942 # Rewrite valid but non-extractable URLs as
943 # extractable English language /watch/ URLs
944 if re.match(self._VPAGE_URL, url) is None:
945 request = compat_urllib_request.Request(url)
947 webpage = compat_urllib_request.urlopen(request).read()
948 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
949 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
# Scrape the page-internal "id"/"vid" JS assignments to build the
# canonical /watch/ URL, then recurse once (new_video=False stops loops).
952 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
954 self._downloader.report_error(u'Unable to extract id field')
956 yahoo_id = mobj.group(1)
958 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
960 self._downloader.report_error(u'Unable to extract vid field')
962 yahoo_vid = mobj.group(1)
964 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
965 return self._real_extract(url, new_video=False)
967 # Retrieve video webpage to extract further information
968 request = compat_urllib_request.Request(url)
970 self.report_download_webpage(video_id)
971 webpage = compat_urllib_request.urlopen(request).read()
972 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
973 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
976 # Extract uploader and title from webpage
977 self.report_extraction(video_id)
978 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
980 self._downloader.report_error(u'unable to extract video title')
982 video_title = mobj.group(1).decode('utf-8')
984 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
986 self._downloader.report_error(u'unable to extract video uploader')
# NOTE(review): the uploader regex captures the name in group(2), but
# group(1) is used here — looks like a latent bug; verify upstream.
988 video_uploader = mobj.group(1).decode('utf-8')
990 # Extract video thumbnail
991 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
993 self._downloader.report_error(u'unable to extract video thumbnail')
995 video_thumbnail = mobj.group(1).decode('utf-8')
997 # Extract video description
998 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1000 self._downloader.report_error(u'unable to extract video description')
1002 video_description = mobj.group(1).decode('utf-8')
1003 if not video_description:
1004 video_description = 'No description available.'
1006 # Extract video height and width
1007 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1009 self._downloader.report_error(u'unable to extract video height')
1011 yv_video_height = mobj.group(1)
1013 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1015 self._downloader.report_error(u'unable to extract video width')
1017 yv_video_width = mobj.group(1)
1019 # Retrieve video playlist to extract media URL
1020 # I'm not completely sure what all these options are, but we
1021 # seem to need most of them, otherwise the server sends a 401.
1022 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1023 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1024 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1025 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1026 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1028 self.report_download_webpage(video_id)
1029 webpage = compat_urllib_request.urlopen(request).read()
1030 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1031 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1034 # Extract media URL from playlist XML
1035 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1037 self._downloader.report_error(u'Unable to extract media URL')
# Final media URL = APP + FULLPATH, percent-decoded and HTML-unescaped.
1039 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1040 video_url = unescapeHTML(video_url)
# Result info dictionary (the surrounding `return [{ ... }]` lines are
# among the dropped lines of this extract).
1043 'id': video_id.decode('utf-8'),
1045 'uploader': video_uploader,
1046 'upload_date': None,
1047 'title': video_title,
1048 'ext': video_extension.decode('utf-8'),
1049 'thumbnail': video_thumbnail.decode('utf-8'),
1050 'description': video_description,
# Extractor for vimeo.com video pages: downloads the watch page, parses
# the embedded player config JSON, and picks the best available
# codec/quality pair.
# NOTE(review): extract is missing source lines (embedded numbers skip),
# including `if mobj is None:` guards, `try:` lines and `return`s.
1054 class VimeoIE(InfoExtractor):
1055 """Information extractor for vimeo.com."""
1057 # _VALID_URL matches Vimeo URLs
1058 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
# Constructor: delegate to the InfoExtractor base class.
1061 def __init__(self, downloader=None):
1062 InfoExtractor.__init__(self, downloader)
# Progress message: watch-page download started.
1064 def report_download_webpage(self, video_id):
1065 """Report webpage download."""
1066 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
# Progress message: metadata extraction started.
1068 def report_extraction(self, video_id):
1069 """Report information extraction."""
1070 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1072 def _real_extract(self, url, new_video=True):
1073 # Extract ID from URL
1074 mobj = re.match(self._VALID_URL, url)
# Error path for a non-matching URL (guard line missing in extract).
1076 self._downloader.report_error(u'Invalid URL: %s' % url)
1079 video_id = mobj.group('id')
# Normalize: force https scheme, and turn player redirect links into the
# canonical https://vimeo.com/<id> page.
1080 if not mobj.group('proto'):
1081 url = 'https://' + url
1082 if mobj.group('direct_link'):
1083 url = 'https://vimeo.com/' + video_id
1085 # Retrieve video webpage to extract further information
1086 request = compat_urllib_request.Request(url, None, std_headers)
1088 self.report_download_webpage(video_id)
1089 webpage_bytes = compat_urllib_request.urlopen(request).read()
1090 webpage = webpage_bytes.decode('utf-8')
1091 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1092 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1095 # Now we begin extracting as much information as we can from what we
1096 # retrieved. First we extract the information common to all extractors,
1097 # and latter we extract those that are Vimeo specific.
1098 self.report_extraction(video_id)
1100 # Extract the config JSON
# Crude string slicing between ' = {config:' and ',assets:' to isolate
# the player's JSON config blob.
1102 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1103 config = json.loads(config)
1105 self._downloader.report_error(u'unable to extract info section')
1109 video_title = config["video"]["title"]
1111 # Extract uploader and uploader_id
# uploader_id is the last path segment of the owner's profile URL.
1112 video_uploader = config["video"]["owner"]["name"]
1113 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1115 # Extract video thumbnail
1116 video_thumbnail = config["video"]["thumbnail"]
1118 # Extract video description
1119 video_description = get_element_by_attribute("itemprop", "description", webpage)
1120 if video_description: video_description = clean_html(video_description)
1121 else: video_description = ''
1123 # Extract upload date
# Upload date is normalized to YYYYMMDD, matching the documented
# `upload_date` field format.
1124 video_upload_date = None
1125 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1126 if mobj is not None:
1127 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1129 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters of play_redirect below.
1130 sig = config['request']['signature']
1131 timestamp = config['request']['timestamp']
1133 # Vimeo specific: extract video codec and quality information
1134 # First consider quality, then codecs, then take everything
1135 # TODO bind to format param
1136 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
# Bucket each available codec by quality tier: hd > sd > other.
1137 files = { 'hd': [], 'sd': [], 'other': []}
1138 for codec_name, codec_extension in codecs:
1139 if codec_name in config["video"]["files"]:
1140 if 'hd' in config["video"]["files"][codec_name]:
1141 files['hd'].append((codec_name, codec_extension, 'hd'))
1142 elif 'sd' in config["video"]["files"][codec_name]:
1143 files['sd'].append((codec_name, codec_extension, 'sd'))
1145 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first entry of the best non-empty tier.
1147 for quality in ('hd', 'sd', 'other'):
1148 if len(files[quality]) > 0:
1149 video_quality = files[quality][0][2]
1150 video_codec = files[quality][0][0]
1151 video_extension = files[quality][0][1]
1152 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1155 self._downloader.report_error(u'no known codec found')
# Build the play_redirect media URL from id, signature, timestamp,
# quality and codec.
1158 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1159 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result info dictionary (surrounding return-list lines dropped).
1164 'uploader': video_uploader,
1165 'uploader_id': video_uploader_id,
1166 'upload_date': video_upload_date,
1167 'title': video_title,
1168 'ext': video_extension,
1169 'thumbnail': video_thumbnail,
1170 'description': video_description,
# Extractor for videos.arte.tv pages. Distinguishes live-stream index
# pages (_LIVE_URL) from regular "Plus 7" replay pages and scrapes each
# via chained regex lookups (grep_webpage).
# NOTE(review): extract is missing source lines (embedded numbers skip) —
# `try:` lines, `return`s and some tuple/paren lines are absent.
1174 class ArteTvIE(InfoExtractor):
1175 """arte.tv information extractor."""
1177 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live pages end in index-<n>.html; matched against the URL's last segment.
1178 _LIVE_URL = r'index-[0-9]+\.html$'
1180 IE_NAME = u'arte.tv'
# Constructor: delegate to the InfoExtractor base class.
1182 def __init__(self, downloader=None):
1183 InfoExtractor.__init__(self, downloader)
# Progress message: webpage download started.
1185 def report_download_webpage(self, video_id):
1186 """Report webpage download."""
1187 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
# Progress message: metadata extraction started.
1189 def report_extraction(self, video_id):
1190 """Report information extraction."""
1191 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Download a URL and return its raw body; reports (rather than raises)
# network errors via the downloader.
1193 def fetch_webpage(self, url):
1194 request = compat_urllib_request.Request(url)
1196 self.report_download_webpage(url)
1197 webpage = compat_urllib_request.urlopen(request).read()
1198 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1199 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1201 except ValueError as err:
1202 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch `url`, apply `regex`, and map capture groups to dict keys using
# matchTuples = [(group_index, key, error_message), ...].
1206 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1207 page = self.fetch_webpage(url)
1208 mobj = re.search(regex, page, regexFlags)
1212 self._downloader.report_error(u'Invalid URL: %s' % url)
1215 for (i, key, err) in matchTuples:
1216 if mobj.group(i) is None:
1217 self._downloader.trouble(err)
1220 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then pull RTMP path/player/
# url for the viewer's language (4th-from-last URL path segment).
1224 def extractLiveStream(self, url):
1225 video_lang = url.split('/')[-4]
1226 info = self.grep_webpage(
1228 r'src="(.*?/videothek_js.*?\.js)',
1231 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1234 http_host = url.split('/')[2]
1235 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1236 info = self.grep_webpage(
1238 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1239 '(http://.*?\.swf).*?' +
1243 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1244 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1245 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1248 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Replay ("Plus 7") path: follow movie-param -> <video lang> ref ->
# final <video> element with hd url, title and date.
1250 def extractPlus7Stream(self, url):
1251 video_lang = url.split('/')[-3]
1252 info = self.grep_webpage(
1254 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1257 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1260 next_url = compat_urllib_parse.unquote(info.get('url'))
1261 info = self.grep_webpage(
1263 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1266 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1269 next_url = compat_urllib_parse.unquote(info.get('url'))
1271 info = self.grep_webpage(
1273 r'<video id="(.*?)".*?>.*?' +
1274 '<name>(.*?)</name>.*?' +
1275 '<dateVideo>(.*?)</dateVideo>.*?' +
1276 '<url quality="hd">(.*?)</url>',
1279 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1280 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1281 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1282 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Result info dictionary (surrounding return lines dropped).
1287 'id': info.get('id'),
1288 'url': compat_urllib_parse.unquote(info.get('url')),
1289 'uploader': u'arte.tv',
1290 'upload_date': info.get('date'),
1291 'title': info.get('title').decode('utf-8'),
# Entry point: dispatch on live vs. replay URL shape.
1297 def _real_extract(self, url):
1298 video_id = url.split('/')[-1]
1299 self.report_extraction(video_id)
1301 if re.search(self._LIVE_URL, video_id) is not None:
1302 self.extractLiveStream(url)
1305 info = self.extractPlus7Stream(url)
# Last-resort extractor: follows URL-shortener redirects via HEAD
# requests, then scrapes the page for a direct media URL using a few
# JW-Player-style regexes.
# NOTE(review): extract is missing source lines (embedded numbers skip) —
# `try:`/`return` lines and parts of the handler bodies are absent.
1310 class GenericIE(InfoExtractor):
1311 """Generic last-resort information extractor."""
1314 IE_NAME = u'generic'
# Constructor: delegate to the InfoExtractor base class.
1316 def __init__(self, downloader=None):
1317 InfoExtractor.__init__(self, downloader)
# Warn that the generic fallback is in use (suppressed under --test),
# then report the page download.
1319 def report_download_webpage(self, video_id):
1320 """Report webpage download."""
1321 if not self._downloader.params.get('test', False):
1322 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1323 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
# Progress message: metadata extraction started.
1325 def report_extraction(self, video_id):
1326 """Report information extraction."""
1327 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# Progress message: a redirect target is being followed instead.
1329 def report_following_redirect(self, new_url):
1330 """Report information extraction."""
1331 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1333 def _test_redirect(self, url):
1334 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass forcing the HEAD method so only headers are fetched.
1335 class HeadRequest(compat_urllib_request.Request):
1336 def get_method(self):
1339 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1341 Subclass the HTTPRedirectHandler to make it use our
1342 HeadRequest also on the redirected URL
1344 def redirect_request(self, req, fp, code, msg, headers, newurl):
1345 if code in (301, 302, 303, 307):
1346 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: a HEAD re-request has no body.
1347 newheaders = dict((k,v) for k,v in req.headers.items()
1348 if k.lower() not in ("content-length", "content-type"))
1349 return HeadRequest(newurl,
1351 origin_req_host=req.get_origin_req_host(),
1354 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1356 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1358 Fallback to GET if HEAD is not allowed (405 HTTP error)
1360 def http_error_405(self, req, fp, code, msg, headers):
1364 newheaders = dict((k,v) for k,v in req.headers.items()
1365 if k.lower() not in ("content-length", "content-type"))
1366 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1368 origin_req_host=req.get_origin_req_host(),
# Build a custom opener wiring in the HEAD/fallback handlers above.
1372 opener = compat_urllib_request.OpenerDirector()
1373 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1374 HTTPMethodFallback, HEADRedirectHandler,
1375 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1376 opener.add_handler(handler())
1378 response = opener.open(HeadRequest(url))
1379 new_url = response.geturl()
# If the final URL differs, restart the whole extraction chain on it.
1384 self.report_following_redirect(new_url)
1385 self._downloader.download([new_url])
1388 def _real_extract(self, url):
1389 if self._test_redirect(url): return
1391 video_id = url.split('/')[-1]
1393 webpage = self._download_webpage(url, video_id)
1394 except ValueError as err:
1395 # since this is the last-resort InfoExtractor, if
1396 # this error is thrown, it'll be thrown here
1397 self._downloader.report_error(u'Invalid URL: %s' % url)
1400 self.report_extraction(video_id)
1401 # Start with something easy: JW Player in SWFObject
1402 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1404 # Broaden the search a little bit
1405 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1407 # Broaden the search a little bit: JWPlayer JS loader
1408 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1410 self._downloader.report_error(u'Invalid URL: %s' % url)
1413 # It's possible that one of the regexes
1414 # matched, but returned an empty group:
1415 if mobj.group(1) is None:
1416 self._downloader.report_error(u'Invalid URL: %s' % url)
1419 video_url = compat_urllib_parse.unquote(mobj.group(1))
1420 video_id = os.path.basename(video_url)
1422 # here's a fun little line of code for you:
# id/extension are derived from the media file's basename.
1423 video_extension = os.path.splitext(video_id)[1][1:]
1424 video_id = os.path.splitext(video_id)[0]
1426 # it's tempting to parse this further, but you would
1427 # have to take into account all the variations like
1428 # Video Title - Site Name
1429 # Site Name | Video Title
1430 # Video Title - Tagline | Site Name
1431 # and so on and so forth; it's just not practical
1432 mobj = re.search(r'<title>(.*)</title>', webpage)
1434 self._downloader.report_error(u'unable to extract title')
1436 video_title = mobj.group(1)
1438 # video uploader is domain name
1439 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1441 self._downloader.report_error(u'unable to extract title')
1443 video_uploader = mobj.group(1)
# Result info dictionary (surrounding return lines dropped).
1448 'uploader': video_uploader,
1449 'upload_date': None,
1450 'title': video_title,
1451 'ext': video_extension,
# Extractor for "ytsearch[N|all]:<query>" pseudo-URLs: queries the GData
# API in pages of 50 and hands each video URL back to the downloader.
# NOTE(review): extract is missing source lines (embedded numbers skip) —
# guards, `try:` lines and `return`s are absent.
1455 class YoutubeSearchIE(InfoExtractor):
1456 """Information Extractor for YouTube search queries."""
1457 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1458 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
# Hard cap on results; 'all' and oversized requests clamp to this.
1459 _max_youtube_results = 1000
1460 IE_NAME = u'youtube:search'
# Constructor: delegate to the InfoExtractor base class.
1462 def __init__(self, downloader=None):
1463 InfoExtractor.__init__(self, downloader)
# Progress message for each API results page fetched.
1465 def report_download_page(self, query, pagenum):
1466 """Report attempt to download search page with given number."""
1467 query = query.decode(preferredencoding())
1468 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the "ytsearch[N|all]:" prefix and dispatch to _download_n_results.
1470 def _real_extract(self, query):
1471 mobj = re.match(self._VALID_URL, query)
1473 self._downloader.report_error(u'invalid search query "%s"' % query)
1476 prefix, query = query.split(':')
1478 query = query.encode('utf-8')
# Empty prefix => single result; 'all' => maximum; else numeric count.
1480 self._download_n_results(query, 1)
1482 elif prefix == 'all':
1483 self._download_n_results(query, self._max_youtube_results)
1489 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1491 elif n > self._max_youtube_results:
1492 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1493 n = self._max_youtube_results
1494 self._download_n_results(query, n)
1496 except ValueError: # parsing prefix as integer fails
1497 self._download_n_results(query, 1)
1500 def _download_n_results(self, query, n):
1501 """Downloads a specified number of results for a query"""
# Page through the API (50 per page) until `limit` ids are collected.
1507 while (50 * pagenum) < limit:
1508 self.report_download_page(query, pagenum+1)
1509 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1510 request = compat_urllib_request.Request(result_url)
1512 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1513 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1514 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1516 api_response = json.loads(data)['data']
1518 if not 'items' in api_response:
1519 self._downloader.trouble(u'[youtube] No video results')
1522 new_ids = list(video['id'] for video in api_response['items'])
1523 video_ids += new_ids
# Never ask for more than the API reports as available.
1525 limit = min(n, api_response['totalItems'])
1528 if len(video_ids) > n:
1529 video_ids = video_ids[:n]
1530 for id in video_ids:
1531 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Extractor for "gvsearch[N|all]:<query>" pseudo-URLs: scrapes Google
# Video search result pages for docids until N videos are found or no
# "next page" link remains.
# NOTE(review): extract is missing source lines (embedded numbers skip).
1535 class GoogleSearchIE(InfoExtractor):
1536 """Information Extractor for Google Video search queries."""
1537 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1538 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Regex capturing the docid from each result link.
1539 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of this marker means another results page exists.
1540 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1541 _max_google_results = 1000
1542 IE_NAME = u'video.google:search'
# Constructor: delegate to the InfoExtractor base class.
1544 def __init__(self, downloader=None):
1545 InfoExtractor.__init__(self, downloader)
# Progress message for each results page fetched.
1547 def report_download_page(self, query, pagenum):
1548 """Report attempt to download playlist page with given number."""
1549 query = query.decode(preferredencoding())
1550 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the "gvsearch[N|all]:" prefix and dispatch to _download_n_results.
1552 def _real_extract(self, query):
1553 mobj = re.match(self._VALID_URL, query)
1555 self._downloader.report_error(u'invalid search query "%s"' % query)
1558 prefix, query = query.split(':')
1560 query = query.encode('utf-8')
1562 self._download_n_results(query, 1)
1564 elif prefix == 'all':
1565 self._download_n_results(query, self._max_google_results)
1571 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1573 elif n > self._max_google_results:
1574 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1575 n = self._max_google_results
1576 self._download_n_results(query, n)
1578 except ValueError: # parsing prefix as integer fails
1579 self._download_n_results(query, 1)
1582 def _download_n_results(self, query, n):
1583 """Downloads a specified number of results for a query"""
1589 self.report_download_page(query, pagenum)
# Results are paged by start offset, 10 per page.
1590 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1591 request = compat_urllib_request.Request(result_url)
1593 page = compat_urllib_request.urlopen(request).read()
1594 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1595 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1598 # Extract video identifiers
1599 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1600 video_id = mobj.group(1)
1601 if video_id not in video_ids:
1602 video_ids.append(video_id)
1603 if len(video_ids) == n:
1604 # Specified n videos reached
1605 for id in video_ids:
1606 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No more pages: flush whatever was collected so far.
1609 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1610 for id in video_ids:
1611 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1614 pagenum = pagenum + 1
# Extractor for "yvsearch[N|all]:<query>" pseudo-URLs: scrapes Yahoo
# Video search result pages, de-duplicating ids via `already_seen`.
# NOTE(review): extract is missing source lines (embedded numbers skip).
1617 class YahooSearchIE(InfoExtractor):
1618 """Information Extractor for Yahoo! Video search queries."""
1621 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1622 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Regex capturing the "pageid/videoid" pair from each result link.
1623 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1624 _MORE_PAGES_INDICATOR = r'\s*Next'
1625 _max_yahoo_results = 1000
1626 IE_NAME = u'video.yahoo:search'
# Constructor: delegate to the InfoExtractor base class.
1628 def __init__(self, downloader=None):
1629 InfoExtractor.__init__(self, downloader)
# Progress message for each results page fetched.
1631 def report_download_page(self, query, pagenum):
1632 """Report attempt to download playlist page with given number."""
1633 query = query.decode(preferredencoding())
1634 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the "yvsearch[N|all]:" prefix and dispatch to _download_n_results.
1636 def _real_extract(self, query):
1637 mobj = re.match(self._VALID_URL, query)
1639 self._downloader.report_error(u'invalid search query "%s"' % query)
1642 prefix, query = query.split(':')
1644 query = query.encode('utf-8')
1646 self._download_n_results(query, 1)
1648 elif prefix == 'all':
1649 self._download_n_results(query, self._max_yahoo_results)
1655 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1657 elif n > self._max_yahoo_results:
1658 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1659 n = self._max_yahoo_results
1660 self._download_n_results(query, n)
1662 except ValueError: # parsing prefix as integer fails
1663 self._download_n_results(query, 1)
1666 def _download_n_results(self, query, n):
1667 """Downloads a specified number of results for a query"""
# Set for O(1) duplicate detection across result pages.
1670 already_seen = set()
1674 self.report_download_page(query, pagenum)
1675 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1676 request = compat_urllib_request.Request(result_url)
1678 page = compat_urllib_request.urlopen(request).read()
1679 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1680 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1683 # Extract video identifiers
1684 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1685 video_id = mobj.group(1)
1686 if video_id not in already_seen:
1687 video_ids.append(video_id)
1688 already_seen.add(video_id)
1689 if len(video_ids) == n:
1690 # Specified n videos reached
1691 for id in video_ids:
1692 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No more pages: flush whatever was collected so far.
1695 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1696 for id in video_ids:
1697 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1700 pagenum = pagenum + 1
# Extractor for YouTube playlist/channel-upload URLs: pages through the
# GData playlist feed, sorts entries by playlist position, applies the
# playliststart/playlistend window, and queues each video URL.
# NOTE(review): extract is missing source lines (embedded numbers skip),
# including parts of the verbose regex and loop/guard lines.
1703 class YoutubePlaylistIE(InfoExtractor):
1704 """Information Extractor for YouTube playlists."""
# Verbose regex: group 1 or 2 carries the playlist id (PL/EC/UU prefix).
1706 _VALID_URL = r"""(?:
1711 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1712 \? (?:.*?&)*? (?:p|a|list)=
1717 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1720 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1722 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1724 IE_NAME = u'youtube:playlist'
# Constructor: delegate to the InfoExtractor base class.
1726 def __init__(self, downloader=None):
1727 InfoExtractor.__init__(self, downloader)
# Override: _VALID_URL is a verbose regex, so match with re.VERBOSE.
1730 def suitable(cls, url):
1731 """Receives a URL and returns True if suitable for this IE."""
1732 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Progress message for each playlist feed page fetched.
1734 def report_download_page(self, playlist_id, pagenum):
1735 """Report attempt to download playlist page with given number."""
1736 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1738 def _real_extract(self, url):
1739 # Extract playlist id
1740 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1742 self._downloader.report_error(u'invalid url: %s' % url)
1745 # Download playlist videos from API
1746 playlist_id = mobj.group(1) or mobj.group(2)
1751 self.report_download_page(playlist_id, page_num)
1753 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1755 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1756 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1757 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1761 response = json.loads(page)
1762 except ValueError as err:
1763 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1766 if not 'feed' in response or not 'entry' in response['feed']:
1767 self._downloader.report_error(u'Got a malformed response from YouTube API')
# Collect (position, video_src) pairs; entries without 'content' (e.g.
# deleted videos) are skipped.
1769 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1770 for entry in response['feed']['entry']
1771 if 'content' in entry ]
# A short page means the feed is exhausted.
1773 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1777 videos = [v[1] for v in sorted(videos)]
# Apply the user's 1-based playliststart/playlistend window.
1780 playliststart = self._downloader.params.get('playliststart', 1) - 1
1781 playlistend = self._downloader.params.get('playlistend', -1)
1782 if playlistend == -1:
1783 videos = videos[playliststart:]
1785 videos = videos[playliststart:playlistend]
1787 if len(videos) == total:
1788 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1790 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1792 for video in videos:
1793 self._downloader.download([video])
# Extractor for YouTube channel URLs: pages through the channel's
# /videos listing, scraping watch ids until the "Next" marker vanishes.
# NOTE(review): extract is missing source lines (embedded numbers skip).
1797 class YoutubeChannelIE(InfoExtractor):
1798 """Information Extractor for YouTube channels."""
1800 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1801 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Literal "Next »" pager text; its absence ends pagination.
1802 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1803 IE_NAME = u'youtube:channel'
# Progress message for each channel listing page fetched.
1805 def report_download_page(self, channel_id, pagenum):
1806 """Report attempt to download channel page with given number."""
1807 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1809 def _real_extract(self, url):
1810 # Extract channel id
1811 mobj = re.match(self._VALID_URL, url)
1813 self._downloader.report_error(u'invalid url: %s' % url)
1816 # Download channel pages
1817 channel_id = mobj.group(1)
1822 self.report_download_page(channel_id, pagenum)
1823 url = self._TEMPLATE_URL % (channel_id, pagenum)
1824 request = compat_urllib_request.Request(url)
1826 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1827 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1828 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1831 # Extract video identifiers
# De-duplicate within the page before extending the global list.
1833 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1834 if mobj.group(1) not in ids_in_page:
1835 ids_in_page.append(mobj.group(1))
1836 video_ids.extend(ids_in_page)
1838 if self._MORE_PAGES_INDICATOR not in page:
1840 pagenum = pagenum + 1
1842 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1844 for id in video_ids:
1845 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# Extractor for YouTube user URLs / "ytuser:" pseudo-URLs: pages through
# the user's GData uploads feed (50 ids per query) and queues each video.
# NOTE(review): extract is missing source lines (embedded numbers skip).
1849 class YoutubeUserIE(InfoExtractor):
1850 """Information Extractor for YouTube users."""
1852 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1853 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps result pages at 50 entries per request.
1854 _GDATA_PAGE_SIZE = 50
1855 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1856 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1857 IE_NAME = u'youtube:user'
# Constructor: delegate to the InfoExtractor base class.
1859 def __init__(self, downloader=None):
1860 InfoExtractor.__init__(self, downloader)
# Progress message showing the id range being fetched.
1862 def report_download_page(self, username, start_index):
1863 """Report attempt to download user page."""
1864 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1865 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1867 def _real_extract(self, url):
1869 mobj = re.match(self._VALID_URL, url)
1871 self._downloader.report_error(u'invalid url: %s' % url)
1874 username = mobj.group(1)
1876 # Download video ids using YouTube Data API. Result size per
1877 # query is limited (currently to 50 videos) so we need to query
1878 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1885 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1886 self.report_download_page(username, start_index)
1888 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1891 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1892 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1893 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1896 # Extract video identifiers
1899 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1900 if mobj.group(1) not in ids_in_page:
1901 ids_in_page.append(mobj.group(1))
1903 video_ids.extend(ids_in_page)
1905 # A little optimization - if current page is not
1906 # "full", ie. does not contain PAGE_SIZE video ids then
1907 # we can assume that this page is the last one - there
1908 # are no more ids on further pages - no need to query
1911 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's 1-based playliststart/playlistend window.
1916 all_ids_count = len(video_ids)
1917 playliststart = self._downloader.params.get('playliststart', 1) - 1
1918 playlistend = self._downloader.params.get('playlistend', -1)
1920 if playlistend == -1:
1921 video_ids = video_ids[playliststart:]
1923 video_ids = video_ids[playliststart:playlistend]
1925 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1926 (username, all_ids_count, len(video_ids)))
1928 for video_id in video_ids:
1929 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# Extractor for blip.tv user pages / "bliptvuser:" pseudo-URLs: resolves
# the numeric users_id from the profile page, then pages through the
# mobile episode-list AJAX endpoint collecting video hrefs.
# NOTE(review): extract is missing source lines (embedded numbers skip);
# _PAGE_SIZE is referenced but its definition is not visible here.
1932 class BlipTVUserIE(InfoExtractor):
1933 """Information Extractor for blip.tv users."""
1935 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1937 IE_NAME = u'blip.tv:user'
# Constructor: delegate to the InfoExtractor base class.
1939 def __init__(self, downloader=None):
1940 InfoExtractor.__init__(self, downloader)
# Progress message for each episode-list page fetched.
1942 def report_download_page(self, username, pagenum):
1943 """Report attempt to download user page."""
1944 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1945 (self.IE_NAME, username, pagenum))
1947 def _real_extract(self, url):
1949 mobj = re.match(self._VALID_URL, url)
1951 self._downloader.report_error(u'invalid url: %s' % url)
1954 username = mobj.group(1)
1956 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# First request fetches the profile page only to scrape the numeric
# users_id needed by the episode-list endpoint.
1958 request = compat_urllib_request.Request(url)
1961 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1962 mobj = re.search(r'data-users-id="([^"]+)"', page)
1963 page_base = page_base % mobj.group(1)
1964 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1965 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1969 # Download video ids using BlipTV Ajax calls. Result size per
1970 # query is limited (currently to 12 videos) so we need to query
1971 # page by page until there are no video ids - it means we got
1978 self.report_download_page(username, pagenum)
1979 url = page_base + "&page=" + str(pagenum)
1980 request = compat_urllib_request.Request( url )
1982 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1983 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here, unlike compat_str(err) elsewhere —
# inconsistent on Python 2; confirm against upstream.
1984 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
1987 # Extract video identifiers
1990 for mobj in re.finditer(r'href="/([^"]+)"', page):
1991 if mobj.group(1) not in ids_in_page:
1992 ids_in_page.append(unescapeHTML(mobj.group(1)))
1994 video_ids.extend(ids_in_page)
1996 # A little optimization - if current page is not
1997 # "full", ie. does not contain PAGE_SIZE video ids then
1998 # we can assume that this page is the last one - there
1999 # are no more ids on further pages - no need to query
2002 if len(ids_in_page) < self._PAGE_SIZE:
# Apply the user's 1-based playliststart/playlistend window.
2007 all_ids_count = len(video_ids)
2008 playliststart = self._downloader.params.get('playliststart', 1) - 1
2009 playlistend = self._downloader.params.get('playlistend', -1)
2011 if playlistend == -1:
2012 video_ids = video_ids[playliststart:]
2014 video_ids = video_ids[playliststart:playlistend]
2016 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2017 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2019 for video_id in video_ids:
2020 self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the hosted file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls below assume Python 2 str
        # objects (webpage is undecoded bytes here); they would raise under
        # Python 3 -- confirm which interpreters this file still targets.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a broken .netrc only loses the login, not the run.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        # NOTE(review): form field names reconstructed from the mobile login
        # flow this URL targets -- confirm against upstream.
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two exact script fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to the real page; the file id is carried in the
        # URL fragment of the redirect target. Re-enter with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask the page for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: original read "self._download.report_error", which would
            # raise AttributeError -- the attribute is "_downloader" everywhere
            # else in this file.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        # The flv lives next to the thumbnail directory.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): both tables reconstructed (entries were lost in the paste).
    # Every known rendition is mp4 and the sizes follow the 16:9 ladder --
    # confirm against upstream.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a multi-line pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Return a list of info dicts, one per part of the episode/clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortname form (:tds, :colbert, ...) -> canonical full-episodes URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Without an explicit episode we follow the redirect to the newest.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The site redirected us to a concrete episode; re-parse that URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v

            # Rewrite the rtmp stream URL to the equivalent HTTP location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the episode."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset announced in the Content-Type header,
            # falling back to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Page metadata carries description, thumbnail and the player URL,
        # which in turn embeds the (urlencoded) config URL.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m manifest: media/@url names the fragment, id the stream.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track URL via the API and return a one-element info list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): IE_NAME collides with SoundcloudIE above; kept unchanged to
    # avoid an interface change -- consider u'soundcloud:set'.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set via the API and return one info dict per track.

        Error reporting made consistent with SoundcloudIE: the deprecated
        trouble(u'ERROR: ...') spelling is replaced by report_error(...).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the talk video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded, urlencoded
        # stream id in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2941 class MixcloudIE(InfoExtractor):
2942 """Information extractor for www.mixcloud.com"""
2944 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2945 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2946 IE_NAME = u'mixcloud'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
2951 def report_download_json(self, file_id):
2952 """Report JSON download."""
2953 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2955 def report_extraction(self, file_id):
2956 """Report information extraction."""
2957 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2959 def get_urls(self, jsonData, fmt, bitrate='best'):
2960 """Get urls from 'audio_formats' section in json"""
2963 bitrate_list = jsonData[fmt]
2964 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2965 bitrate = max(bitrate_list) # select highest
2967 url_list = jsonData[fmt][bitrate]
2968 except TypeError: # we have no bitrate info.
2969 url_list = jsonData[fmt]
2972 def check_urls(self, url_list):
2973 """Returns 1st active url from list"""
2974 for url in url_list:
2976 compat_urllib_request.urlopen(url)
2978 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2983 def _print_formats(self, formats):
2984 print('Available formats:')
2985 for fmt in formats.keys():
2986 for b in formats[fmt]:
2988 ext = formats[fmt][b][0]
2989 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2990 except TypeError: # we have no bitrate info
2991 ext = formats[fmt][0]
2992 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2995 def _real_extract(self, url):
2996 mobj = re.match(self._VALID_URL, url)
2998 self._downloader.report_error(u'invalid URL: %s' % url)
3000 # extract uploader & filename from url
3001 uploader = mobj.group(1).decode('utf-8')
3002 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3004 # construct API request
3005 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3006 # retrieve .json file with links to files
3007 request = compat_urllib_request.Request(file_url)
3009 self.report_download_json(file_url)
3010 jsonData = compat_urllib_request.urlopen(request).read()
3011 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3012 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3016 json_data = json.loads(jsonData)
3017 player_url = json_data['player_swf_url']
3018 formats = dict(json_data['audio_formats'])
3020 req_format = self._downloader.params.get('format', None)
3023 if self._downloader.params.get('listformats', None):
3024 self._print_formats(formats)
3027 if req_format is None or req_format == 'best':
3028 for format_param in formats.keys():
3029 url_list = self.get_urls(formats, format_param)
3031 file_url = self.check_urls(url_list)
3032 if file_url is not None:
3035 if req_format not in formats:
3036 self._downloader.report_error(u'format is not available')
3039 url_list = self.get_urls(formats, req_format)
3040 file_url = self.check_urls(url_list)
3041 format_param = req_format
3044 'id': file_id.decode('utf-8'),
3045 'url': file_url.decode('utf-8'),
3046 'uploader': uploader.decode('utf-8'),
3047 'upload_date': None,
3048 'title': json_data['name'],
3049 'ext': file_url.split('.')[-1].decode('utf-8'),
3050 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3051 'thumbnail': json_data['thumbnail_url'],
3052 'description': json_data['description'],
3053 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom: handles a single video page, a course
# page (recursing into its videos), and the site root (recursing into courses).
# NOTE(review): interior lines are missing from this excerpt (numbering gaps),
# e.g. `try:` lines, `if mobj is None:` guards and dict-literal openers.
3056 class StanfordOpenClassroomIE(InfoExtractor):
3057 """Information extractor for Stanford's Open ClassRoom"""
3059 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3060 IE_NAME = u'stanfordoc'
3062 def report_download_webpage(self, objid):
3063 """Report information extraction."""
3064 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3066 def report_extraction(self, video_id):
3067 """Report information extraction."""
3068 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3070 def _real_extract(self, url):
3071 mobj = re.match(self._VALID_URL, url)
3073 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: a specific video — metadata comes from a per-video XML file.
3075 if mobj.group('course') and mobj.group('video'): # A specific video
3076 course = mobj.group('course')
3077 video = mobj.group('video')
3079 'id': course + '_' + video,
3081 'upload_date': None,
3084 self.report_extraction(info['id'])
3085 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3086 xmlUrl = baseUrl + video + '.xml'
3088 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3089 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3090 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3092 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3094 info['title'] = mdoc.findall('./title')[0].text
3095 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3097 self._downloader.report_error(u'Invalid metadata XML file')
3099 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page — collect VideoPage links and recurse via self.extract.
3101 elif mobj.group('course'): # A course page
3102 course = mobj.group('course')
3107 'upload_date': None,
3110 coursepage = self._download_webpage(url, info['id'],
3111 note='Downloading course info page',
3112 errnote='Unable to download course info page')
3114 m = re.search('<h1>([^<]+)</h1>', coursepage)
3116 info['title'] = unescapeHTML(m.group(1))
3118 info['title'] = info['id']
3120 m = re.search('<description>([^<]+)</description>', coursepage)
3122 info['description'] = unescapeHTML(m.group(1))
3124 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3127 'type': 'reference',
3128 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3132 for entry in info['list']:
3133 assert entry['type'] == 'reference'
3134 results += self.extract(entry['url'])
# Case 3: the root page — collect CoursePage links and recurse per course.
3138 'id': 'Stanford OpenClassroom',
3141 'upload_date': None,
3144 self.report_download_webpage(info['id'])
3145 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3147 rootpage = compat_urllib_request.urlopen(rootURL).read()
3148 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3149 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3152 info['title'] = info['id']
3154 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3157 'type': 'reference',
3158 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3163 for entry in info['list']:
3164 assert entry['type'] == 'reference'
3165 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes <meta> tags for song/performer,
# then fetches a mediaGen XML playlist and picks the last (highest-quality) rendition.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
3168 class MTVIE(InfoExtractor):
3169 """Information extractor for MTV.com"""
3171 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3174 def report_extraction(self, video_id):
3175 """Report information extraction."""
3176 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3178 def _real_extract(self, url):
3179 mobj = re.match(self._VALID_URL, url)
3181 self._downloader.report_error(u'invalid URL: %s' % url)
# Scheme-less URLs are accepted by _VALID_URL; normalize before downloading.
3183 if not mobj.group('proto'):
3184 url = 'http://' + url
3185 video_id = mobj.group('videoid')
3187 webpage = self._download_webpage(url, video_id)
3189 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3191 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a re group fails on Python 3 (already str) — verify.
3193 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3194 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3196 self._downloader.report_error(u'unable to extract performer')
3198 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3199 video_title = performer + ' - ' + song_name
3201 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message below looks truncated — presumably meant 'unable to extract mtvn_uri'.
3203 self._downloader.report_error(u'unable to mtvn_uri')
3205 mtvn_uri = mobj.group(1)
3207 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3209 self._downloader.report_error(u'unable to extract content id')
3211 content_id = mobj.group(1)
3213 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3214 self.report_extraction(video_id)
3215 request = compat_urllib_request.Request(videogen_url)
3217 metadataXml = compat_urllib_request.urlopen(request).read()
3218 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3219 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3222 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3223 renditions = mdoc.findall('.//rendition')
3225 # For now, always pick the highest quality.
3226 rendition = renditions[-1]
# Format string is built as ext-WxH_bitrate from the rendition attributes.
3229 _,_,ext = rendition.attrib['type'].partition('/')
3230 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3231 video_url = rendition.find('./src').text
3233 self._downloader.trouble('Invalid rendition field.')
3239 'uploader': performer,
3240 'upload_date': None,
3241 'title': video_title,
# Extractor for v.youku.com: fetches a JSON playlist, de-obfuscates the segment
# file id with a seeded PRNG shuffle, and yields one info dict per segment.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps) —
# e.g. the `def _gen_sid(self):` header before line 3261 is not visible.
3249 class YoukuIE(InfoExtractor):
3250 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3252 def report_download_webpage(self, file_id):
3253 """Report webpage download."""
3254 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3256 def report_extraction(self, file_id):
3257 """Report information extraction."""
3258 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp plus two bounded random integers.
3261 nowTime = int(time.time() * 1000)
3262 random1 = random.randint(1000,1998)
3263 random2 = random.randint(1000,9999)
3265 return "%d%d%d" %(nowTime,random1,random2)
3267 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet driven by a linear-congruential seed.
3269 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3271 for i in range(len(source)):
3272 seed = (seed * 211 + 30031 ) % 65536
3273 index = math.floor(seed / 65536 * len(source) )
3274 mixed.append(source[int(index)])
3275 source.remove(source[int(index)])
3276 #return ''.join(mixed)
3279 def _get_file_id(self, fileId, seed):
# Map '*'-separated indices through the mixed alphabet to recover the real id.
3280 mixed = self._get_file_ID_mix_string(seed)
3281 ids = fileId.split('*')
3285 realId.append(mixed[int(ch)])
3286 return ''.join(realId)
3288 def _real_extract(self, url):
3289 mobj = re.match(self._VALID_URL, url)
3291 self._downloader.report_error(u'invalid URL: %s' % url)
3293 video_id = mobj.group('ID')
3295 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3297 request = compat_urllib_request.Request(info_url, None, std_headers)
3299 self.report_download_webpage(video_id)
3300 jsondata = compat_urllib_request.urlopen(request).read()
3301 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3302 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3305 self.report_extraction(video_id)
3307 jsonstr = jsondata.decode('utf-8')
3308 config = json.loads(jsonstr)
3310 video_title = config['data'][0]['title']
3311 seed = config['data'][0]['seed']
3313 format = self._downloader.params.get('format', None)
3314 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format selection: 'best' prefers hd2, 'worst' picks the low end (elided branches).
3316 if format is None or format == 'best':
3317 if 'hd2' in supported_format:
3322 elif format == 'worst':
3330 fileid = config['data'][0]['streamfileids'][format]
3331 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3332 except (UnicodeDecodeError, ValueError, KeyError):
3333 self._downloader.report_error(u'unable to extract info section')
3337 sid = self._gen_sid()
3338 fileid = self._get_file_id(fileid, seed)
3340 #column 8,9 of fileid represent the segment number
3341 #fileid[7:9] should be changed
3342 for index, key in enumerate(keys):
3344 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3345 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3348 'id': '%s_part%02d' % (video_id, index),
3349 'url': download_url,
3351 'upload_date': None,
3352 'title': video_title,
3355 files_info.append(info)
# Extractor for video.xnxx.com: pulls flv URL, title and thumbnail out of the
# page with three class-level regexes.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps),
# e.g. `try:` lines and `if ... is None:` guards.
3360 class XNXXIE(InfoExtractor):
3361 """Information extractor for xnxx.com"""
3363 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3365 VIDEO_URL_RE = r'flv_url=(.*?)&'
3366 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3367 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3369 def report_webpage(self, video_id):
3370 """Report information extraction"""
3371 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3373 def report_extraction(self, video_id):
3374 """Report information extraction"""
3375 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3377 def _real_extract(self, url):
3378 mobj = re.match(self._VALID_URL, url)
3380 self._downloader.report_error(u'invalid URL: %s' % url)
3382 video_id = mobj.group(1)
3384 self.report_webpage(video_id)
3386 # Get webpage content
3388 webpage_bytes = compat_urllib_request.urlopen(url).read()
3389 webpage = webpage_bytes.decode('utf-8')
3390 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3391 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# The flv URL is percent-encoded in the page; unquote before use.
3394 result = re.search(self.VIDEO_URL_RE, webpage)
3396 self._downloader.report_error(u'unable to extract video url')
3398 video_url = compat_urllib_parse.unquote(result.group(1))
3400 result = re.search(self.VIDEO_TITLE_RE, webpage)
3402 self._downloader.report_error(u'unable to extract video title')
3404 video_title = result.group(1)
3406 result = re.search(self.VIDEO_THUMB_RE, webpage)
3408 self._downloader.report_error(u'unable to extract video thumbnail')
3410 video_thumbnail = result.group(1)
3416 'upload_date': None,
3417 'title': video_title,
3419 'thumbnail': video_thumbnail,
3420 'description': None,
# Extractor for Google+ video posts: scrapes the post page for date/uploader/
# title, follows the photo viewer page, and picks the highest-resolution link.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
3424 class GooglePlusIE(InfoExtractor):
3425 """Information extractor for plus.google.com."""
3427 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3428 IE_NAME = u'plus.google'
3430 def __init__(self, downloader=None):
3431 InfoExtractor.__init__(self, downloader)
3433 def report_extract_entry(self, url):
3434 """Report downloading extry"""
3435 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3437 def report_date(self, upload_date):
3438 """Report downloading extry"""
3439 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3441 def report_uploader(self, uploader):
3442 """Report downloading extry"""
3443 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3445 def report_title(self, video_title):
3446 """Report downloading extry"""
3447 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3449 def report_extract_vid_page(self, video_page):
3450 """Report information extraction."""
3451 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3453 def _real_extract(self, url):
3454 # Extract id from URL
3455 mobj = re.match(self._VALID_URL, url)
3457 self._downloader.report_error(u'Invalid URL: %s' % url)
3460 post_url = mobj.group(0)
3461 video_id = mobj.group(1)
3463 video_extension = 'flv'
3465 # Step 1, Retrieve post webpage to extract further information
3466 self.report_extract_entry(post_url)
3467 request = compat_urllib_request.Request(post_url)
3469 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3470 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3471 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3474 # Extract update date
3476 pattern = 'title="Timestamp">(.*?)</a>'
3477 mobj = re.search(pattern, webpage)
3479 upload_date = mobj.group(1)
3480 # Convert timestring to a format suitable for filename
3481 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3482 upload_date = upload_date.strftime('%Y%m%d')
3483 self.report_date(upload_date)
3487 pattern = r'rel\="author".*?>(.*?)</a>'
3488 mobj = re.search(pattern, webpage)
3490 uploader = mobj.group(1)
3491 self.report_uploader(uploader)
3494 # Get the first line for title
3496 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3497 mobj = re.search(pattern, webpage)
3499 video_title = mobj.group(1)
3500 self.report_title(video_title)
3502 # Step 2, Stimulate clicking the image box to launch video
3503 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3504 mobj = re.search(pattern, webpage)
3506 self._downloader.report_error(u'unable to extract video page URL')
3508 video_page = mobj.group(1)
3509 request = compat_urllib_request.Request(video_page)
3511 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3512 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3513 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3515 self.report_extract_vid_page(video_page)
3518 # Extract video links on video page
3519 """Extract video links of all sizes"""
3520 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3521 mobj = re.findall(pattern, webpage)
3523 self._downloader.report_error(u'unable to extract video links')
3525 # Sort in resolution
3526 links = sorted(mobj)
3528 # Choose the lowest of the sort, i.e. highest resolution
3529 video_url = links[-1]
3530 # Only get the url. The resolution part in the tuple has no use anymore
3531 video_url = video_url[-1]
3532 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError path re-encodes first.
3534 video_url = video_url.decode("unicode_escape")
3535 except AttributeError: # Python 3
3536 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3542 'uploader': uploader,
3543 'upload_date': upload_date,
3544 'title': video_title,
3545 'ext': video_extension,
# Extractor for nba.com: the mp4 URL is derived directly from the path; page
# metadata is scraped with a small local helper.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
3548 class NBAIE(InfoExtractor):
3549 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3552 def _real_extract(self, url):
3553 mobj = re.match(self._VALID_URL, url)
3555 self._downloader.report_error(u'invalid URL: %s' % url)
3558 video_id = mobj.group(1)
3559 if video_id.endswith('/index.html'):
3560 video_id = video_id[:-len('/index.html')]
3562 webpage = self._download_webpage(url, video_id)
3564 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, unescaped, or `default` when absent.
3565 def _findProp(rexp, default=None):
3566 m = re.search(rexp, webpage)
3568 return unescapeHTML(m.group(1))
3572 shortened_video_id = video_id.rpartition('/')[2]
3573 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3575 'id': shortened_video_id,
# NOTE(review): 'uploader_date' below looks like a typo for 'upload_date' — verify
# against the optional-field list documented on InfoExtractor.
3579 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3580 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv: pages through the archive API (100 items
# per request) for channels, or fetches one broadcast by archive id.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
3584 class JustinTVIE(InfoExtractor):
3585 """Information extractor for justin.tv and twitch.tv"""
3586 # TODO: One broadcast may be split into multiple videos. The key
3587 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3588 # starts at 1 and increases. Can we treat all parts as one video?
3590 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3591 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3592 _JUSTIN_PAGE_LIMIT = 100
3593 IE_NAME = u'justin.tv'
3595 def report_extraction(self, file_id):
3596 """Report information extraction."""
3597 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3599 def report_download_page(self, channel, offset):
3600 """Report attempt to download a single page of videos."""
3601 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3602 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3604 # Return count of items, list of *valid* items
3605 def _parse_page(self, url):
3607 urlh = compat_urllib_request.urlopen(url)
3608 webpage_bytes = urlh.read()
3609 webpage = webpage_bytes.decode('utf-8', 'ignore')
3610 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3611 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# A non-list response is an API error payload with an 'error' key.
3614 response = json.loads(webpage)
3615 if type(response) != list:
3616 error_text = response.get('error', 'unknown error')
3617 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3620 for clip in response:
3621 video_url = clip['video_file_url']
3623 video_extension = os.path.splitext(video_url)[1][1:]
3624 video_date = re.sub('-', '', clip['start_time'][:10])
3625 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3626 video_id = clip['id']
3627 video_title = clip.get('title', video_id)
3631 'title': video_title,
3632 'uploader': clip.get('channel_name', video_uploader_id),
3633 'uploader_id': video_uploader_id,
3634 'upload_date': video_date,
3635 'ext': video_extension,
3637 return (len(response), info)
3639 def _real_extract(self, url):
3640 mobj = re.match(self._VALID_URL, url)
3642 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 2 present => a single /b/<id> broadcast; otherwise page a whole channel.
3645 api = 'http://api.justin.tv'
3646 video_id = mobj.group(mobj.lastindex)
3648 if mobj.lastindex == 1:
3650 api += '/channel/archives/%s.json'
3652 api += '/broadcast/by_archive/%s.json'
3653 api = api % (video_id,)
3655 self.report_extraction(video_id)
3659 limit = self._JUSTIN_PAGE_LIMIT
3662 self.report_download_page(video_id, offset)
3663 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3664 page_count, page_info = self._parse_page(page_url)
3665 info.extend(page_info)
# A short page means the archive is exhausted; single broadcasts are unpaged.
3666 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from nested <source> tags, title from
# the player page h1, optional og:description.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
# NOTE(review): line 3690 uses self._downloader.trouble while nearby extractors
# use report_error — likely an unconverted call; verify against the rest of the file.
3671 class FunnyOrDieIE(InfoExtractor):
3672 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3674 def _real_extract(self, url):
3675 mobj = re.match(self._VALID_URL, url)
3677 self._downloader.report_error(u'invalid URL: %s' % url)
3680 video_id = mobj.group('id')
3681 webpage = self._download_webpage(url, video_id)
3683 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3685 self._downloader.report_error(u'unable to find video information')
3686 video_url = unescapeHTML(m.group('url'))
3688 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3690 self._downloader.trouble(u'Cannot find video title')
3691 title = clean_html(m.group('title'))
3693 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3695 desc = unescapeHTML(m.group('desc'))
3704 'description': desc,
# Extractor for store.steampowered.com game trailer pages: matches movie entries
# in the page's JS and zips them with scraped titles and thumbnails.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps) —
# e.g. the (?P<gameID>...) part of _VALID_URL used at line 3723 is not visible.
3708 class SteamIE(InfoExtractor):
3709 _VALID_URL = r"""http://store.steampowered.com/
3710 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3712 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is a verbose-mode pattern and the
# base class matches without re.VERBOSE.
3716 def suitable(cls, url):
3717 """Receives a URL and returns True if suitable for this IE."""
3718 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3720 def _real_extract(self, url):
3721 m = re.match(self._VALID_URL, url, re.VERBOSE)
3722 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3723 gameID = m.group('gameID')
3724 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3725 webpage = self._download_webpage(videourl, gameID)
3726 mweb = re.finditer(urlRE, webpage)
3727 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3728 titles = re.finditer(namesRE, webpage)
3729 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3730 thumbs = re.finditer(thumbsRE, webpage)
# zip() silently truncates to the shortest of the three iterators.
3732 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3733 video_id = vid.group('videoID')
3734 title = vtitle.group('videoName')
3735 video_url = vid.group('videoURL')
3736 video_thumb = thumb.group('thumbnail')
3738 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3743 'title': unescapeHTML(title),
3744 'thumbnail': video_thumb
# Extractor for ustream.tv recorded videos: the flv URL is derived from the id;
# title and uploader are scraped from data-* attributes.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
3749 class UstreamIE(InfoExtractor):
3750 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3751 IE_NAME = u'ustream'
3753 def _real_extract(self, url):
3754 m = re.match(self._VALID_URL, url)
3755 video_id = m.group('videoID')
# CDN URL is fully determined by the recorded-video id.
3756 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3757 webpage = self._download_webpage(url, video_id)
3758 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3759 title = m.group('title')
3760 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3761 uploader = m.group('uploader')
3767 'uploader': uploader
# Extractor for worldstarhiphop.com (and the 'candy' mirror): finds the hosted
# mp4/flv URL in the page source, with fallback title handling for candy pages.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
3771 class WorldStarHipHopIE(InfoExtractor):
3772 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3773 IE_NAME = u'WorldStarHipHop'
3775 def _real_extract(self, url):
3776 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3778 webpage_src = compat_urllib_request.urlopen(url).read()
3779 webpage_src = webpage_src.decode('utf-8')
3781 mobj = re.search(_src_url, webpage_src)
3783 m = re.match(self._VALID_URL, url)
3784 video_id = m.group('id')
3786 if mobj is not None:
3787 video_url = mobj.group()
3788 if 'mp4' in video_url:
3793 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3796 _title = r"""<title>(.*)</title>"""
3798 mobj = re.search(_title, webpage_src)
3800 if mobj is not None:
3801 title = mobj.group(1)
# Fallback title when the page has no <title> tag.
# NOTE(review): 'World Start Hip Hop' below looks like a typo for 'World Star
# Hip Hop' — a runtime string, so left untouched here; flag for a code fix.
3803 title = 'World Start Hip Hop - %s' % time.ctime()
3805 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3806 mobj = re.search(_thumbnail, webpage_src)
3808 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3809 if mobj is not None:
3810 thumbnail = mobj.group(1)
3812 _title = r"""candytitles.*>(.*)</span>"""
3813 mobj = re.search(_title, webpage_src)
3814 if mobj is not None:
3815 title = mobj.group(1)
3822 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: metadata comes from a JSON blob assigned
# to gon.show in an inline <script>; the stream URL gets a fixed 256k cbr param.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
3827 class RBMARadioIE(InfoExtractor):
3828 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3830 def _real_extract(self, url):
3831 m = re.match(self._VALID_URL, url)
3832 video_id = m.group('videoID')
3834 webpage = self._download_webpage(url, video_id)
3835 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3837 raise ExtractorError(u'Cannot find metadata')
3838 json_data = m.group(1)
3841 data = json.loads(json_data)
3842 except ValueError as e:
3843 raise ExtractorError(u'Invalid JSON: ' + str(e))
3845 video_url = data['akamai_url'] + '&cbr=256'
# Extension is whatever follows the last '.' in the URL path.
3846 url_parts = compat_urllib_parse_urlparse(video_url)
3847 video_ext = url_parts.path.rpartition('.')[2]
3852 'title': data['title'],
3853 'description': data.get('teaser_text'),
3854 'location': data.get('country_of_origin'),
3855 'uploader': data.get('host', {}).get('name'),
3856 'uploader_id': data.get('host', {}).get('slug'),
3857 'thumbnail': data.get('image', {}).get('large_url_2x'),
3858 'duration': data.get('duration'),
# Extractor for youporn.com: scrapes title/date/uploader, then parses the
# downloadList links into one format entry per quality and applies the user's
# format selection (best/worst/all/specific).
# NOTE(review): interior lines are missing from this excerpt (numbering gaps) —
# e.g. the size/bitrate assignments used at line 3945 are not visible.
3864 class YouPornIE(InfoExtractor):
3865 """Information extractor for youporn.com."""
3866 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3867 def _print_formats(self, formats):
3868 """Print all available formats"""
3869 print(u'Available formats:')
3870 print(u'ext\t\tformat')
3871 print(u'---------------------------------')
3872 for format in formats:
3873 print(u'%s\t\t%s' % (format['ext'], format['format']))
3875 def _specific(self, req_format, formats):
3877 if(x["format"]==req_format):
3881 def _real_extract(self, url):
3882 mobj = re.match(self._VALID_URL, url)
3884 self._downloader.report_error(u'invalid URL: %s' % url)
3887 video_id = mobj.group('videoid')
# The age gate is bypassed by sending the age_verified cookie up front.
3889 req = compat_urllib_request.Request(url)
3890 req.add_header('Cookie', 'age_verified=1')
3891 webpage = self._download_webpage(req, video_id)
3893 # Get the video title
3894 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3896 raise ExtractorError(u'Unable to extract video title')
3897 video_title = result.group('title').strip()
3899 # Get the video date
3900 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3902 self._downloader.report_warning(u'unable to extract video date')
3905 upload_date = result.group('date').strip()
3907 # Get the video uploader
3908 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3910 self._downloader.report_warning(u'unable to extract uploader')
3911 video_uploader = None
3913 video_uploader = result.group('uploader').strip()
3914 video_uploader = clean_html( video_uploader )
3916 # Get all of the formats available
3917 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3918 result = re.search(DOWNLOAD_LIST_RE, webpage)
3920 raise ExtractorError(u'Unable to extract download list')
3921 download_list_html = result.group('download_list').strip()
3923 # Get all of the links from the page
3924 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3925 links = re.findall(LINK_RE, download_list_html)
3926 if(len(links) == 0):
3927 raise ExtractorError(u'ERROR: no known formats available for video')
3929 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3934 # A link looks like this:
3935 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3936 # A path looks like this:
3937 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3938 video_url = unescapeHTML( link )
3939 path = compat_urllib_parse_urlparse( video_url ).path
3940 extension = os.path.splitext( path )[1][1:]
# Quality components (e.g. '480p', '370k') live in the 5th path segment.
3941 format = path.split('/')[4].split('_')[:2]
3944 format = "-".join( format )
3945 title = u'%s-%s-%s' % (video_title, size, bitrate)
3950 'uploader': video_uploader,
3951 'upload_date': upload_date,
3956 'description': None,
3960 if self._downloader.params.get('listformats', None):
3961 self._print_formats(formats)
3964 req_format = self._downloader.params.get('format', None)
3965 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats are assumed ordered best-first: best=first, worst=last, all=everything.
3967 if req_format is None or req_format == 'best':
3969 elif req_format == 'worst':
3970 return [formats[-1]]
3971 elif req_format in ('-1', 'all'):
3974 format = self._specific( req_format, formats )
3976 self._downloader.report_error(u'requested format not available')
# Extractor for pornotube.com: the title comes from the URL itself; the flv URL
# and upload date are scraped from the page.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
3983 class PornotubeIE(InfoExtractor):
3984 """Information extractor for pornotube.com."""
3985 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3986 def _real_extract(self, url):
3987 mobj = re.match(self._VALID_URL, url)
3989 self._downloader.report_error(u'invalid URL: %s' % url)
3992 video_id = mobj.group('videoid')
3993 video_title = mobj.group('title')
3995 # Get webpage content
3996 webpage = self._download_webpage(url, video_id)
3999 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4000 result = re.search(VIDEO_URL_RE, webpage)
4002 self._downloader.report_error(u'unable to extract video url')
4004 video_url = compat_urllib_parse.unquote(result.group('url'))
4006 #Get the uploaded date
4007 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4008 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this failure is about the upload date, but the message says
# 'video title' — a runtime string, so left untouched here; flag for a code fix.
4010 self._downloader.report_error(u'unable to extract video title')
4012 upload_date = result.group('date')
4014 info = {'id': video_id,
4017 'upload_date': upload_date,
4018 'title': video_title,
# Extractor for youjizz.com: follows the embed page referenced by the watch
# page, then reads the file URL out of the embedded player setup call.
# NOTE(review): interior lines are missing from this excerpt (numbering gaps).
4024 class YouJizzIE(InfoExtractor):
4025 """Information extractor for youjizz.com."""
4026 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4028 def _real_extract(self, url):
4029 mobj = re.match(self._VALID_URL, url)
4031 self._downloader.report_error(u'invalid URL: %s' % url)
4034 video_id = mobj.group('videoid')
4036 # Get webpage content
4037 webpage = self._download_webpage(url, video_id)
4039 # Get the video title
4040 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4042 raise ExtractorError(u'ERROR: unable to extract video title')
4043 video_title = result.group('title').strip()
4045 # Get the embed page
4046 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4048 raise ExtractorError(u'ERROR: unable to extract embed page')
# The embed page carries its own numeric id; it replaces the slug-derived one.
4050 embed_page_url = result.group(0).strip()
4051 video_id = result.group('videoid')
4053 webpage = self._download_webpage(embed_page_url, video_id)
4056 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4058 raise ExtractorError(u'ERROR: unable to extract video url')
4059 video_url = result.group('source')
4061 info = {'id': video_id,
4063 'title': video_title,
4066 'player_url': embed_page_url}
# Information extractor for 8tracks.com mixes: parses the PAGE.mix JSON blob
# out of the page, then walks the playlist via the sets/.../play and
# sets/.../next JSONH API, one request per track, building a list of track
# info dicts.
# NOTE(review): excerpt is gapped (embedded numbering 4070-4111 skips
# lines).  In particular `mix_id` is used below but its assignment
# (presumably taken from `data`) is on an elided line, and the guards for
# the raised ExtractorErrors plus the loop's result-list append/return are
# not visible.  Verify against the full file.
4070 class EightTracksIE(InfoExtractor):
4072 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4074 def _real_extract(self, url):
4075 mobj = re.match(self._VALID_URL, url)
4077 raise ExtractorError(u'Invalid URL: %s' % url)
4078 playlist_id = mobj.group('id')
4080 webpage = self._download_webpage(url, playlist_id)
# The page embeds the mix metadata as a JS assignment; capture it lazily up
# to the first "...;\n".
4082 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4084 raise ExtractorError(u'Cannot find trax information')
4085 json_like = m.group(1)
4086 data = json.loads(json_like)
# A random session token is required by the play/next API endpoints.
4088 session = str(random.randint(0, 1000000000))
4090 track_count = data['tracks_count']
4091 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4092 next_url = first_url
# One API round-trip per track; loop ends when at_last_track is set.
4094 for i in itertools.count():
4095 api_json = self._download_webpage(next_url, playlist_id,
4096 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4097 errnote=u'Failed to download song information')
4098 api_data = json.loads(api_json)
4099 track_data = api_data[u'set']['track']
4101 'id': track_data['id'],
4102 'url': track_data['track_file_stream_url'],
4103 'title': track_data['performer'] + u' - ' + track_data['name'],
4104 'raw_title': track_data['name'],
4105 'uploader_id': data['user']['login'],
4109 if api_data['set']['at_last_track']:
4111 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Information extractor for keek.com: the CDN URLs for the video and its
# thumbnail are derived directly from the video id, so only title and
# uploader are scraped from the page.
# NOTE(review): excerpt is gapped (embedded numbering 4114-4134 skips
# lines) -- the info dict's 'url'/'ext' entries and the return statement
# are elided here.
4114 class KeekIE(InfoExtractor):
4115 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4118 def _real_extract(self, url):
4119 m = re.match(self._VALID_URL, url)
4120 video_id = m.group('videoID')
# Direct CDN URLs built from the id; no page scraping needed for these.
4121 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4122 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4123 webpage = self._download_webpage(url, video_id)
4124 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4125 title = unescapeHTML(m.group('title'))
4126 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4127 uploader = clean_html(m.group('uploader'))
4133 'thumbnail': thumbnail,
4134 'uploader': uploader
# Information extractor for ted.com.  Handles two URL kinds via one VERBOSE
# regex: /playlists/<id>/... (expanded into the playlist's talks) and
# /talks/<name>.html (a single talk).  Talk pages embed a talkDetails JS
# object from which the id and mediaSlug are taken; the actual mp4 lives at
# download.ted.com.
# NOTE(review): excerpt is gapped (embedded numbering 4138-4207 skips
# lines) -- parts of the verbose _VALID_URL and video_RE patterns, the
# info-list initialisation/return in _playlist_videos_info, and the start
# of the returned info dict in _talk_info are elided here.
4138 class TEDIE(InfoExtractor):
4139 _VALID_URL=r'''http://www.ted.com/
4141 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4143 ((?P<type_talk>talks)) # We have a simple talk
4145 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL is a VERBOSE pattern, which
# the base-class match (no flags) would not handle.
4149 def suitable(cls, url):
4150 """Receives a URL and returns True if suitable for this IE."""
4151 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4153 def _real_extract(self, url):
4154 m=re.match(self._VALID_URL, url, re.VERBOSE)
4155 if m.group('type_talk'):
4156 return [self._talk_info(url)]
# Otherwise it is a playlist URL.
4158 playlist_id=m.group('playlist_id')
4159 name=m.group('name')
4160 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4161 return self._playlist_videos_info(url,name,playlist_id)
4163 def _talk_video_link(self,mediaSlug):
4164 '''Returns the video link for that mediaSlug'''
4165 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4167 def _playlist_videos_info(self,url,name,playlist_id=0):
4168 '''Returns the videos of the playlist'''
# Matches each talk's <li> entry (id + mediaslug) ...
4170 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4171 ([.\s]*?)data-playlist_item_id="(\d+)"
4172 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
# ... and, separately, each talk's title link; the two iterators are then
# zipped together positionally.
4174 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4175 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4176 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4177 m_names=re.finditer(video_name_RE,webpage)
4179 for m_video, m_name in zip(m_videos,m_names):
4180 video_id=m_video.group('video_id')
4181 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
# Each talk page is fetched individually for its full info.
4182 info.append(self._talk_info(talk_url,video_id))
4185 def _talk_info(self, url, video_id=0):
4186 """Return the video for the talk in the url"""
4187 m=re.match(self._VALID_URL, url,re.VERBOSE)
4188 videoName=m.group('name')
4189 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4190 # If the url includes the language we get the title translated
4191 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4192 title=re.search(title_RE, webpage).group('title')
# The talkDetails JS object holds the numeric id and the mediaSlug used to
# build the download URL.
4193 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4194 "id":(?P<videoID>[\d]+).*?
4195 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4196 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4197 thumb_match=re.search(thumb_RE,webpage)
4198 info_match=re.search(info_RE,webpage,re.VERBOSE)
4199 video_id=info_match.group('videoID')
4200 mediaSlug=info_match.group('mediaSlug')
4201 video_url=self._talk_video_link(mediaSlug)
4207 'thumbnail': thumb_match.group('thumbnail')
# Information extractor for myspass.de: derives the video id from the URL
# path, fetches an XML metadata document from the site's API, and reads
# url_flv/title/format_id/description/imagePreview elements from it.
# NOTE(review): excerpt is gapped (embedded numbering 4211-4263 skips
# lines) -- the condition guarding the parent-path fallback, the else
# branches providing defaults for format/description/thumbnail, and the
# start of the returned info dict are elided here.
4211 class MySpassIE(InfoExtractor):
4212 _VALID_URL = r'http://www.myspass.de/.*'
4214 def _real_extract(self, url):
4215 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4217 # video id is the last path element of the URL
4218 # usually there is a trailing slash, so also try the second but last
4219 url_path = compat_urllib_parse_urlparse(url).path
4220 url_parent_path, video_id = os.path.split(url_path)
# Fallback for trailing-slash URLs: take the parent path's last element
# (the guard condition for this line is on an elided line).
4222 _, video_id = os.path.split(url_parent_path)
4225 metadata_url = META_DATA_URL_TEMPLATE % video_id
4226 metadata_text = self._download_webpage(metadata_url, video_id)
4227 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4229 # extract values from metadata
4230 url_flv_el = metadata.find('url_flv')
4231 if url_flv_el is None:
4232 self._downloader.report_error(u'unable to extract download url')
4234 video_url = url_flv_el.text
# File extension is taken from the download URL's suffix (without the dot).
4235 extension = os.path.splitext(video_url)[1][1:]
4236 title_el = metadata.find('title')
4237 if title_el is None:
4238 self._downloader.report_error(u'unable to extract title')
4240 title = title_el.text
4241 format_id_el = metadata.find('format_id')
4242 if format_id_el is None:
4245 format = format_id_el.text
4246 description_el = metadata.find('description')
4247 if description_el is not None:
4248 description = description_el.text
4251 imagePreview_el = metadata.find('imagePreview')
4252 if imagePreview_el is not None:
4253 thumbnail = imagePreview_el.text
4262 'thumbnail': thumbnail,
4263 'description': description
# Information extractor for spiegel.de videos: scrapes the title from the
# page, then fetches a per-video XML document whose last entry (assumed to
# be the best format) provides the flv filename and duration.
# NOTE(review): excerpt is gapped (embedded numbering 4267-4296 skips
# lines) -- the "if m is None:" guard and the start/end of the returned
# info dict are elided here.
4267 class SpiegelIE(InfoExtractor):
4268 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4270 def _real_extract(self, url):
4271 m = re.match(self._VALID_URL, url)
4272 video_id = m.group('videoID')
4274 webpage = self._download_webpage(url, video_id)
4275 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4277 raise ExtractorError(u'Cannot find title')
4278 video_title = unescapeHTML(m.group(1))
4280 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4281 xml_code = self._download_webpage(xml_url, video_id,
4282 note=u'Downloading XML', errnote=u'Failed to download XML')
4284 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element is used; presumably the formats are listed in
# ascending quality -- confirm against a live XML document.
4285 last_type = idoc[-1]
4286 filename = last_type.findall('./filename')[0].text
4287 duration = float(last_type.findall('./duration')[0].text)
4289 video_url = 'http://video2.spiegel.de/flash/' + filename
4290 video_ext = filename.rpartition('.')[2]
4295 'title': video_title,
4296 'duration': duration,
# Information extractor for liveleak.com: pulls the stream URL from the
# player's `file: "..."` JS assignment and title/description/uploader from
# og: meta tags and the "By:" credit line.
# NOTE(review): excerpt is gapped (embedded numbering 4300-4344 skips
# lines) -- the "if ... is None:" guards, default assignments for
# desc/uploader, and the start of the info dict are elided.  Also note the
# mix of the deprecated self._downloader.trouble() with report_error()
# within this one method -- worth unifying once the full file is in view.
4300 class LiveLeakIE(InfoExtractor):
4302 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4303 IE_NAME = u'liveleak'
4305 def _real_extract(self, url):
4306 mobj = re.match(self._VALID_URL, url)
4308 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4311 video_id = mobj.group('video_id')
4313 webpage = self._download_webpage(url, video_id)
4315 m = re.search(r'file: "(.*?)",', webpage)
4317 self._downloader.report_error(u'unable to find video url')
4319 video_url = m.group(1)
4321 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4323 self._downloader.trouble(u'Cannot find video title')
# The site prefixes titles with "LiveLeak.com -"; strip that marketing text.
4324 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4326 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4328 desc = unescapeHTML(m.group('desc'))
4332 m = re.search(r'By:.*?(\w+)</a>', webpage)
4334 uploader = clean_html(m.group(1))
4343 'description': desc,
4344 'uploader': uploader
4350 def gen_extractors():
4351 """ Return a list of an instance of every supported extractor.
4352 The order does matter; the first extractor matched is the one handling the URL.
4355 YoutubePlaylistIE(),
4380 StanfordOpenClassroomIE(),
4390 WorldStarHipHopIE(),