2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): reconstructed from a garbled source dump (indentation and
    # several interior lines were lost) -- verify against version control.
    _ready = False        # True once _real_initialize() has run
    _downloader = None    # FileDownloader instance (set via set_downloader)
    _WORKING = True       # set to False in subclasses for broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Strip the trailing "IE" from the class name, e.g. YoutubeIE -> Youtube.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour an explicit charset= in the Content-Type header; default to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        return webpage_bytes.decode(encoding, 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): reconstructed from a garbled source dump; elided lines
    # (try/except scaffolding, some dict entries) were filled in from the
    # youtube-dl upstream of this era -- verify against version control.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string; values reconstructed -- TODO confirm
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt file contents."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Return (error_message_or_None, srt_contents_or_None) for video_id."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available format codes with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, log in (credentials/.netrc) and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden anti-forgery tokens out of the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        # NOTE(review): some form fields below were reconstructed -- verify.
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video id (group 2 of _VALID_URL) or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): reconstructed from a garbled source dump; elided lines
    # filled in from upstream -- verify against version control.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age; disables the family filter so all videos are reachable.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): reconstructed from a garbled source dump; elided lines
    # filled in from upstream -- verify against version control.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip the slug/query junk that can follow the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # The cookie disables Dailymotion's family filter.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality available, in descending order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): reconstructed from a garbled source dump; elided lines
    # filled in from upstream -- verify against version control.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): reconstructed from a garbled source dump; elided lines
    # filled in from upstream -- verify against version control.
    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1023 class VimeoIE(InfoExtractor):
1024 """Information extractor for vimeo.com."""
# NOTE(review): this chunk is an elided listing — original line numbers are
# embedded in each line and several lines are missing from view (the `try:`
# before line 1057, the `if mobj is None:` guards before the trouble() calls,
# and the `return [info]` / info-dict opening near the end). Code left
# byte-identical; only comments added.
1026 # _VALID_URL matches Vimeo URLs
1027 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1030 def __init__(self, downloader=None):
1031 InfoExtractor.__init__(self, downloader)
1033 def report_download_webpage(self, video_id):
1034 """Report webpage download."""
1035 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1037 def report_extraction(self, video_id):
1038 """Report information extraction."""
1039 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1041 def _real_extract(self, url, new_video=True):
1042 # Extract ID from URL
1043 mobj = re.match(self._VALID_URL, url)
1045 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1048 video_id = mobj.group('id')
# Normalize the URL: force https and resolve play_redirect_hls links to the
# canonical vimeo.com watch page.
1049 if not mobj.group('proto'):
1050 url = 'https://' + url
1051 if mobj.group('direct_link'):
1052 url = 'https://vimeo.com/' + video_id
1054 # Retrieve video webpage to extract further information
1055 request = compat_urllib_request.Request(url, None, std_headers)
1057 self.report_download_webpage(video_id)
1058 webpage_bytes = compat_urllib_request.urlopen(request).read()
1059 webpage = webpage_bytes.decode('utf-8')
1060 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1061 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1064 # Now we begin extracting as much information as we can from what we
1065 # retrieved. First we extract the information common to all extractors,
1066 # and latter we extract those that are Vimeo specific.
1067 self.report_extraction(video_id)
1069 # Extract the config JSON
# The player config is embedded in the page as inline JS; this slices the
# JSON object out of the assignment rather than parsing the HTML.
1071 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1072 config = json.loads(config)
1074 self._downloader.trouble(u'ERROR: unable to extract info section')
1078 video_title = config["video"]["title"]
1080 # Extract uploader and uploader_id
1081 video_uploader = config["video"]["owner"]["name"]
1082 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1084 # Extract video thumbnail
1085 video_thumbnail = config["video"]["thumbnail"]
1087 # Extract video description
1088 video_description = get_element_by_attribute("itemprop", "description", webpage)
1089 if video_description: video_description = clean_html(video_description)
1090 else: video_description = ''
1092 # Extract upload date
1093 video_upload_date = None
1094 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1095 if mobj is not None:
1096 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1098 # Vimeo specific: extract request signature and timestamp
1099 sig = config['request']['signature']
1100 timestamp = config['request']['timestamp']
1102 # Vimeo specific: extract video codec and quality information
1103 # First consider quality, then codecs, then take everything
1104 # TODO bind to format param
1105 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
# Bucket each available codec by quality; 'other' takes the first listed
# quality string for codecs with neither 'hd' nor 'sd'.
1106 files = { 'hd': [], 'sd': [], 'other': []}
1107 for codec_name, codec_extension in codecs:
1108 if codec_name in config["video"]["files"]:
1109 if 'hd' in config["video"]["files"][codec_name]:
1110 files['hd'].append((codec_name, codec_extension, 'hd'))
1111 elif 'sd' in config["video"]["files"][codec_name]:
1112 files['sd'].append((codec_name, codec_extension, 'sd'))
1114 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best non-empty bucket in preference order hd > sd > other.
1116 for quality in ('hd', 'sd', 'other'):
1117 if len(files[quality]) > 0:
1118 video_quality = files[quality][0][2]
1119 video_codec = files[quality][0][0]
1120 video_extension = files[quality][0][1]
1121 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1124 self._downloader.trouble(u'ERROR: no known codec found')
1127 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1128 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# NOTE(review): the info-dict literal's opening and 'id'/'url' keys appear
# elided from this listing; only the trailing entries are visible below.
1133 'uploader': video_uploader,
1134 'uploader_id': video_uploader_id,
1135 'upload_date': video_upload_date,
1136 'title': video_title,
1137 'ext': video_extension,
1138 'thumbnail': video_thumbnail,
1139 'description': video_description,
1143 class ArteTvIE(InfoExtractor):
1144 """arte.tv information extractor."""
# NOTE(review): elided listing — `try:` lines, `if mobj is None:` guards,
# `info = {}` initialisation in grep_webpage, and the return statements are
# missing from view. Code left byte-identical; only comments added.
1146 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1147 _LIVE_URL = r'index-[0-9]+\.html$'
1149 IE_NAME = u'arte.tv'
1151 def __init__(self, downloader=None):
1152 InfoExtractor.__init__(self, downloader)
1154 def report_download_webpage(self, video_id):
1155 """Report webpage download."""
1156 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1158 def report_extraction(self, video_id):
1159 """Report information extraction."""
1160 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Download a URL and return the page body (error paths call trouble()).
1162 def fetch_webpage(self, url):
1163 request = compat_urllib_request.Request(url)
1165 self.report_download_webpage(url)
1166 webpage = compat_urllib_request.urlopen(request).read()
1167 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1168 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1170 except ValueError as err:
1171 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Fetch `url`, apply `regex`, and map capture-group indices to dict keys via
# matchTuples: a list of (group_index, key, error_message) triples.
1175 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1176 page = self.fetch_webpage(url)
1177 mobj = re.search(regex, page, regexFlags)
1181 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1184 for (i, key, err) in matchTuples:
1185 if mobj.group(i) is None:
1186 self._downloader.trouble(err)
1189 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the swf player and rtmp
# path for the viewer's language (4th-from-last URL segment).
1193 def extractLiveStream(self, url):
1194 video_lang = url.split('/')[-4]
1195 info = self.grep_webpage(
1197 r'src="(.*?/videothek_js.*?\.js)',
1200 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1203 http_host = url.split('/')[2]
1204 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1205 info = self.grep_webpage(
1207 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1208 '(http://.*?\.swf).*?' +
1212 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1213 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1214 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1217 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "+7" catch-up path: follow two levels of XML indirection (movie param ->
# language-specific <video> ref -> per-video metadata with hd url).
1219 def extractPlus7Stream(self, url):
1220 video_lang = url.split('/')[-3]
1221 info = self.grep_webpage(
1223 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1226 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1229 next_url = compat_urllib_parse.unquote(info.get('url'))
1230 info = self.grep_webpage(
1232 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1235 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1238 next_url = compat_urllib_parse.unquote(info.get('url'))
1240 info = self.grep_webpage(
1242 r'<video id="(.*?)".*?>.*?' +
1243 '<name>(.*?)</name>.*?' +
1244 '<dateVideo>(.*?)</dateVideo>.*?' +
1245 '<url quality="hd">(.*?)</url>',
1248 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1249 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1250 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1251 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1256 'id': info.get('id'),
1257 'url': compat_urllib_parse.unquote(info.get('url')),
1258 'uploader': u'arte.tv',
1259 'upload_date': info.get('date'),
1260 'title': info.get('title').decode('utf-8'),
# Dispatch: live-stream URLs (index-NN.html) vs. regular "+7" videos.
1266 def _real_extract(self, url):
1267 video_id = url.split('/')[-1]
1268 self.report_extraction(video_id)
1270 if re.search(self._LIVE_URL, video_id) is not None:
1271 self.extractLiveStream(url)
1274 info = self.extractPlus7Stream(url)
1279 class GenericIE(InfoExtractor):
1280 """Generic last-resort information extractor."""
# NOTE(review): elided listing — `try:` lines, `if mobj is None:` guards,
# some method bodies (e.g. HeadRequest.get_method's return) and return
# statements are missing from view. Code left byte-identical.
1283 IE_NAME = u'generic'
1285 def __init__(self, downloader=None):
1286 InfoExtractor.__init__(self, downloader)
1288 def report_download_webpage(self, video_id):
1289 """Report webpage download."""
# Suppress the scary fallback warning in test mode.
1290 if not self._downloader.params.get('test', False):
1291 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1292 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1294 def report_extraction(self, video_id):
1295 """Report information extraction."""
1296 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1298 def report_following_redirect(self, new_url):
1299 """Report information extraction."""
1300 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1302 def _test_redirect(self, url):
1303 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Issue a HEAD request (cheap — no body) to discover the final URL behind
# shorteners, then restart the extraction chain on that URL.
1304 class HeadRequest(compat_urllib_request.Request):
1305 def get_method(self):
1308 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1310 Subclass the HTTPRedirectHandler to make it use our
1311 HeadRequest also on the redirected URL
1313 def redirect_request(self, req, fp, code, msg, headers, newurl):
1314 if code in (301, 302, 303, 307):
1315 newurl = newurl.replace(' ', '%20')
# Drop body-related headers: a HEAD request carries no payload.
1316 newheaders = dict((k,v) for k,v in req.headers.items()
1317 if k.lower() not in ("content-length", "content-type"))
1318 return HeadRequest(newurl,
1320 origin_req_host=req.get_origin_req_host(),
1323 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1325 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1327 Fallback to GET if HEAD is not allowed (405 HTTP error)
1329 def http_error_405(self, req, fp, code, msg, headers):
1333 newheaders = dict((k,v) for k,v in req.headers.items()
1334 if k.lower() not in ("content-length", "content-type"))
1335 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1337 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers we need (no cookies, no auth).
1341 opener = compat_urllib_request.OpenerDirector()
1342 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1343 HTTPMethodFallback, HEADRedirectHandler,
1344 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1345 opener.add_handler(handler())
1347 response = opener.open(HeadRequest(url))
1348 new_url = response.geturl()
1353 self.report_following_redirect(new_url)
1354 self._downloader.download([new_url])
1357 def _real_extract(self, url):
1358 if self._test_redirect(url): return
1360 video_id = url.split('/')[-1]
1362 webpage = self._download_webpage(url, video_id)
1363 except ValueError as err:
1364 # since this is the last-resort InfoExtractor, if
1365 # this error is thrown, it'll be thrown here
1366 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1369 self.report_extraction(video_id)
# Progressively broader regexes to find an embedded media URL.
1370 # Start with something easy: JW Player in SWFObject
1371 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1373 # Broaden the search a little bit
1374 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1376 # Broaden the search a little bit: JWPlayer JS loader
1377 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1379 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1382 # It's possible that one of the regexes
1383 # matched, but returned an empty group:
1384 if mobj.group(1) is None:
1385 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1388 video_url = compat_urllib_parse.unquote(mobj.group(1))
1389 video_id = os.path.basename(video_url)
1391 # here's a fun little line of code for you:
1392 video_extension = os.path.splitext(video_id)[1][1:]
1393 video_id = os.path.splitext(video_id)[0]
1395 # it's tempting to parse this further, but you would
1396 # have to take into account all the variations like
1397 # Video Title - Site Name
1398 # Site Name | Video Title
1399 # Video Title - Tagline | Site Name
1400 # and so on and so forth; it's just not practical
1401 mobj = re.search(r'<title>(.*)</title>', webpage)
1403 self._downloader.trouble(u'ERROR: unable to extract title')
1405 video_title = mobj.group(1)
1407 # video uploader is domain name
1408 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1410 self._downloader.trouble(u'ERROR: unable to extract title')
1412 video_uploader = mobj.group(1)
# NOTE(review): the info-dict opening and 'id'/'url' keys appear elided.
1417 'uploader': video_uploader,
1418 'upload_date': None,
1419 'title': video_title,
1420 'ext': video_extension,
1424 class YoutubeSearchIE(InfoExtractor):
1425 """Information Extractor for YouTube search queries."""
# NOTE(review): elided listing — missing lines include the `if mobj is None:`
# guards, the if/elif scaffolding around the prefix handling, and the loop
# variable initialisation in _download_n_results. Code left byte-identical.
1426 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1427 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1428 _max_youtube_results = 1000
1429 IE_NAME = u'youtube:search'
1431 def __init__(self, downloader=None):
1432 InfoExtractor.__init__(self, downloader)
1434 def report_download_page(self, query, pagenum):
1435 """Report attempt to download search page with given number."""
1436 query = query.decode(preferredencoding())
1437 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the ytsearch prefix: empty -> 1 result, 'all' -> max, N -> N results.
1439 def _real_extract(self, query):
1440 mobj = re.match(self._VALID_URL, query)
1442 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1445 prefix, query = query.split(':')
1447 query = query.encode('utf-8')
1449 self._download_n_results(query, 1)
1451 elif prefix == 'all':
1452 self._download_n_results(query, self._max_youtube_results)
1458 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1460 elif n > self._max_youtube_results:
1461 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1462 n = self._max_youtube_results
1463 self._download_n_results(query, n)
1465 except ValueError: # parsing prefix as integer fails
1466 self._download_n_results(query, 1)
1469 def _download_n_results(self, query, n):
1470 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until `limit` is reached.
1476 while (50 * pagenum) < limit:
1477 self.report_download_page(query, pagenum+1)
1478 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1479 request = compat_urllib_request.Request(result_url)
1481 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1482 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1483 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1485 api_response = json.loads(data)['data']
1487 if not 'items' in api_response:
1488 self._downloader.trouble(u'[youtube] No video results')
1491 new_ids = list(video['id'] for video in api_response['items'])
1492 video_ids += new_ids
# Cap the loop limit at the API-reported total so we stop early.
1494 limit = min(n, api_response['totalItems'])
1497 if len(video_ids) > n:
1498 video_ids = video_ids[:n]
1499 for id in video_ids:
1500 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1504 class GoogleSearchIE(InfoExtractor):
1505 """Information Extractor for Google Video search queries."""
# NOTE(review): elided listing (missing guards / try: / loop scaffolding),
# structurally parallel to YoutubeSearchIE. Code left byte-identical.
1506 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1507 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1508 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1509 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1510 _max_google_results = 1000
1511 IE_NAME = u'video.google:search'
1513 def __init__(self, downloader=None):
1514 InfoExtractor.__init__(self, downloader)
1516 def report_download_page(self, query, pagenum):
1517 """Report attempt to download playlist page with given number."""
1518 query = query.decode(preferredencoding())
1519 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the gvsearch prefix: empty -> 1 result, 'all' -> max, N -> N results.
1521 def _real_extract(self, query):
1522 mobj = re.match(self._VALID_URL, query)
1524 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1527 prefix, query = query.split(':')
1529 query = query.encode('utf-8')
1531 self._download_n_results(query, 1)
1533 elif prefix == 'all':
1534 self._download_n_results(query, self._max_google_results)
1540 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1542 elif n > self._max_google_results:
1543 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1544 n = self._max_google_results
1545 self._download_n_results(query, n)
1547 except ValueError: # parsing prefix as integer fails
1548 self._download_n_results(query, 1)
1551 def _download_n_results(self, query, n):
1552 """Downloads a specified number of results for a query"""
# Scrape HTML result pages (10 per page), dedupe ids, stop at n or when the
# "next page" marker disappears.
1558 self.report_download_page(query, pagenum)
1559 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1560 request = compat_urllib_request.Request(result_url)
1562 page = compat_urllib_request.urlopen(request).read()
1563 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1564 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1567 # Extract video identifiers
1568 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1569 video_id = mobj.group(1)
1570 if video_id not in video_ids:
1571 video_ids.append(video_id)
1572 if len(video_ids) == n:
1573 # Specified n videos reached
1574 for id in video_ids:
1575 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1578 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1579 for id in video_ids:
1580 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1583 pagenum = pagenum + 1
1586 class YahooSearchIE(InfoExtractor):
1587 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): elided listing (missing guards / try: / loop scaffolding),
# structurally parallel to GoogleSearchIE but with an `already_seen` set for
# dedup. Code left byte-identical; only comments added.
1590 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1591 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1592 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1593 _MORE_PAGES_INDICATOR = r'\s*Next'
1594 _max_yahoo_results = 1000
1595 IE_NAME = u'video.yahoo:search'
1597 def __init__(self, downloader=None):
1598 InfoExtractor.__init__(self, downloader)
1600 def report_download_page(self, query, pagenum):
1601 """Report attempt to download playlist page with given number."""
1602 query = query.decode(preferredencoding())
1603 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the yvsearch prefix: empty -> 1 result, 'all' -> max, N -> N results.
1605 def _real_extract(self, query):
1606 mobj = re.match(self._VALID_URL, query)
1608 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1611 prefix, query = query.split(':')
1613 query = query.encode('utf-8')
1615 self._download_n_results(query, 1)
1617 elif prefix == 'all':
1618 self._download_n_results(query, self._max_yahoo_results)
1624 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1626 elif n > self._max_yahoo_results:
1627 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1628 n = self._max_yahoo_results
1629 self._download_n_results(query, n)
1631 except ValueError: # parsing prefix as integer fails
1632 self._download_n_results(query, 1)
1635 def _download_n_results(self, query, n):
1636 """Downloads a specified number of results for a query"""
1639 already_seen = set()
1643 self.report_download_page(query, pagenum)
1644 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1645 request = compat_urllib_request.Request(result_url)
1647 page = compat_urllib_request.urlopen(request).read()
1648 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1649 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1652 # Extract video identifiers
1653 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1654 video_id = mobj.group(1)
1655 if video_id not in already_seen:
1656 video_ids.append(video_id)
1657 already_seen.add(video_id)
1658 if len(video_ids) == n:
1659 # Specified n videos reached
1660 for id in video_ids:
1661 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1664 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1665 for id in video_ids:
1666 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1669 pagenum = pagenum + 1
1672 class YoutubePlaylistIE(InfoExtractor):
1673 """Information Extractor for YouTube playlists."""
# NOTE(review): elided listing — the verbose-regex alternation lines, the
# _MAX_RESULTS class attribute, `if mobj is None:` guards, and loop breaks
# are missing from view. Code left byte-identical; only comments added.
1675 _VALID_URL = r"""(?:
1680 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1681 \? (?:.*?&)*? (?:p|a|list)=
1686 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1689 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1691 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1693 IE_NAME = u'youtube:playlist'
1695 def __init__(self, downloader=None):
1696 InfoExtractor.__init__(self, downloader)
# Overrides the base suitable(): _VALID_URL is a verbose-mode regex, so the
# re.VERBOSE flag must be passed explicitly.
1699 def suitable(cls, url):
1700 """Receives a URL and returns True if suitable for this IE."""
1701 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1703 def report_download_page(self, playlist_id, pagenum):
1704 """Report attempt to download playlist page with given number."""
1705 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1707 def _real_extract(self, url):
1708 # Extract playlist id
1709 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1711 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1714 # Download playlist videos from API
1715 playlist_id = mobj.group(1) or mobj.group(2)
1720 self.report_download_page(playlist_id, page_num)
1722 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1724 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1725 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1726 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1730 response = json.loads(page)
1731 except ValueError as err:
1732 self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
1735 if not 'feed' in response or not 'entry' in response['feed']:
1736 self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
# Collect (position, watch-url) pairs; entries without 'content' (e.g.
# deleted/private videos) are skipped.
1738 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1739 for entry in response['feed']['entry']
1740 if 'content' in entry ]
1742 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1746 videos = [v[1] for v in sorted(videos)]
# Apply --playlist-start / --playlist-end (1-based options, -1 = no end).
1749 playliststart = self._downloader.params.get('playliststart', 1) - 1
1750 playlistend = self._downloader.params.get('playlistend', -1)
1751 if playlistend == -1:
1752 videos = videos[playliststart:]
1754 videos = videos[playliststart:playlistend]
1756 if len(videos) == total:
1757 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1759 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1761 for video in videos:
1762 self._downloader.download([video])
1766 class YoutubeChannelIE(InfoExtractor):
1767 """Information Extractor for YouTube channels."""
# NOTE(review): elided listing — `if mobj is None:`, `try:`, the video_ids
# initialisation and the loop break are missing from view. Code left
# byte-identical; only comments added.
1769 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1770 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1771 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1772 IE_NAME = u'youtube:channel'
1774 def report_download_page(self, channel_id, pagenum):
1775 """Report attempt to download channel page with given number."""
1776 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1778 def _real_extract(self, url):
1779 # Extract channel id
1780 mobj = re.match(self._VALID_URL, url)
1782 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1785 # Download channel pages
1786 channel_id = mobj.group(1)
1791 self.report_download_page(channel_id, pagenum)
1792 url = self._TEMPLATE_URL % (channel_id, pagenum)
1793 request = compat_urllib_request.Request(url)
1795 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1796 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1797 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1800 # Extract video identifiers
1802 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1803 if mobj.group(1) not in ids_in_page:
1804 ids_in_page.append(mobj.group(1))
1805 video_ids.extend(ids_in_page)
# Stop paging once the "Next »" link disappears from the HTML.
1807 if self._MORE_PAGES_INDICATOR not in page:
1809 pagenum = pagenum + 1
1811 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1813 for id in video_ids:
1814 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1818 class YoutubeUserIE(InfoExtractor):
1819 """Information Extractor for YouTube users."""
# NOTE(review): elided listing — guards, `try:` lines, list initialisations
# and loop breaks are missing from view. Code left byte-identical.
1821 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1822 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1823 _GDATA_PAGE_SIZE = 50
1824 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1825 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1826 IE_NAME = u'youtube:user'
1828 def __init__(self, downloader=None):
1829 InfoExtractor.__init__(self, downloader)
1831 def report_download_page(self, username, start_index):
1832 """Report attempt to download user page."""
1833 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1834 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1836 def _real_extract(self, url):
1838 mobj = re.match(self._VALID_URL, url)
1840 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1843 username = mobj.group(1)
1845 # Download video ids using YouTube Data API. Result size per
1846 # query is limited (currently to 50 videos) so we need to query
1847 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1854 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1855 self.report_download_page(username, start_index)
1857 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1860 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1861 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1862 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1865 # Extract video identifiers
1868 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1869 if mobj.group(1) not in ids_in_page:
1870 ids_in_page.append(mobj.group(1))
1872 video_ids.extend(ids_in_page)
1874 # A little optimization - if current page is not
1875 # "full", ie. does not contain PAGE_SIZE video ids then
1876 # we can assume that this page is the last one - there
1877 # are no more ids on further pages - no need to query
1880 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start / --playlist-end (1-based options, -1 = no end).
1885 all_ids_count = len(video_ids)
1886 playliststart = self._downloader.params.get('playliststart', 1) - 1
1887 playlistend = self._downloader.params.get('playlistend', -1)
1889 if playlistend == -1:
1890 video_ids = video_ids[playliststart:]
1892 video_ids = video_ids[playliststart:playlistend]
1894 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1895 (username, all_ids_count, len(video_ids)))
1897 for video_id in video_ids:
1898 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1901 class BlipTVUserIE(InfoExtractor):
1902 """Information Extractor for blip.tv users."""
# NOTE(review): elided listing — the _PAGE_SIZE class attribute, `try:`
# lines, guards, list initialisations and loop breaks are missing from view.
# Code left byte-identical; only comments added.
1904 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1906 IE_NAME = u'blip.tv:user'
1908 def __init__(self, downloader=None):
1909 InfoExtractor.__init__(self, downloader)
1911 def report_download_page(self, username, pagenum):
1912 """Report attempt to download user page."""
1913 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1914 (self.IE_NAME, username, pagenum))
1916 def _real_extract(self, url):
1918 mobj = re.match(self._VALID_URL, url)
1920 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1923 username = mobj.group(1)
1925 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# First fetch the user page just to scrape the numeric users_id that the
# Ajax episode-list endpoint requires.
1927 request = compat_urllib_request.Request(url)
1930 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1931 mobj = re.search(r'data-users-id="([^"]+)"', page)
1932 page_base = page_base % mobj.group(1)
1933 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1934 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1938 # Download video ids using BlipTV Ajax calls. Result size per
1939 # query is limited (currently to 12 videos) so we need to query
1940 # page by page until there are no video ids - it means we got
1947 self.report_download_page(username, pagenum)
1948 url = page_base + "&page=" + str(pagenum)
1949 request = compat_urllib_request.Request( url )
1951 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1952 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1953 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1956 # Extract video identifiers
1959 for mobj in re.finditer(r'href="/([^"]+)"', page):
1960 if mobj.group(1) not in ids_in_page:
1961 ids_in_page.append(unescapeHTML(mobj.group(1)))
1963 video_ids.extend(ids_in_page)
1965 # A little optimization - if current page is not
1966 # "full", ie. does not contain PAGE_SIZE video ids then
1967 # we can assume that this page is the last one - there
1968 # are no more ids on further pages - no need to query
1971 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playlist-start / --playlist-end (1-based options, -1 = no end).
1976 all_ids_count = len(video_ids)
1977 playliststart = self._downloader.params.get('playliststart', 1) - 1
1978 playlistend = self._downloader.params.get('playlistend', -1)
1980 if playlistend == -1:
1981 video_ids = video_ids[playliststart:]
1983 video_ids = video_ids[playliststart:playlistend]
1985 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1986 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1988 for video_id in video_ids:
1989 self._downloader.download([u'http://blip.tv/'+video_id])
1992 class DepositFilesIE(InfoExtractor):
1993 """Information extractor for depositfiles.com"""
1995 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1997 def report_download_webpage(self, file_id):
1998 """Report webpage download."""
1999 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2001 def report_extraction(self, file_id):
2002 """Report information extraction."""
2003 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2005 def _real_extract(self, url):
# The file id is the last URL path component.
2006 file_id = url.split('/')[-1]
2007 # Rebuild url in english locale
2008 url = 'http://depositfiles.com/en/files/' + file_id
2010 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
2011 free_download_indication = { 'gateway_result' : '1' }
2012 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2014 self.report_download_webpage(file_id)
# NOTE(review): the response is read as bytes (no .decode here), yet the
# regexes below search it with str patterns and .decode('utf-8') is later
# called on str values — this mix looks Python-2-only; confirm.
2015 webpage = compat_urllib_request.urlopen(request).read()
2016 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2017 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2020 # Search for the real file URL
2021 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2022 if (mobj is None) or (mobj.group(1) is None):
2023 # Try to figure out reason of the error.
# The site explains restrictions (e.g. download limits) in a <strong> tag.
2024 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2025 if (mobj is not None) and (mobj.group(1) is not None):
2026 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2027 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2029 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2032 file_url = mobj.group(1)
# Extension derived from the URL path, without the leading dot.
2033 file_extension = os.path.splitext(file_url)[1][1:]
2035 # Search for file title
2036 mobj = re.search(r'<b title="(.*?)">', webpage)
# Reached when the title regex fails (guard line not visible in dump).
2038 self._downloader.trouble(u'ERROR: unable to extract title')
2040 file_title = mobj.group(1).decode('utf-8')
# Returned info dictionary (surrounding literal lines partially missing).
2043 'id': file_id.decode('utf-8'),
2044 'url': file_url.decode('utf-8'),
2046 'upload_date': None,
2047 'title': file_title,
2048 'ext': file_extension.decode('utf-8'),
2052 class FacebookIE(InfoExtractor):
2053 """Information Extractor for Facebook"""
2055 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2056 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2057 _NETRC_MACHINE = 'facebook'
2058 IE_NAME = u'facebook'
2060 def report_login(self):
2061 """Report attempt to log in."""
2062 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Optional login step: credentials come from --username/--password or,
# failing that, from the user's .netrc entry for 'facebook'. Login
# failures are warnings, not fatal errors.
2064 def _real_initialize(self):
2065 if self._downloader is None:
2070 downloader_params = self._downloader.params
2072 # Attempt to use provided username and password or .netrc data
2073 if downloader_params.get('username', None) is not None:
2074 useremail = downloader_params['username']
2075 password = downloader_params['password']
2076 elif downloader_params.get('usenetrc', False):
2078 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2079 if info is not None:
2083 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2084 except (IOError, netrc.NetrcParseError) as err:
2085 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip login entirely.
2088 if useremail is None:
# NOTE(review): login_form is referenced here but its construction is not
# visible in this dump — presumably built from useremail/password above.
2097 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2100 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means the login failed.
2101 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2102 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2104 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2105 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2108 def _real_extract(self, url):
2109 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not visible in dump).
2111 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2113 video_id = mobj.group('ID')
2115 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2116 webpage = self._download_webpage(url, video_id)
# The flashvars JSON sits between these two exact script fragments;
# both are escaped so they match literally.
2118 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2119 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2120 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2122 raise ExtractorError(u'Cannot parse data')
2123 data = dict(json.loads(m.group(1)))
# 'params' is URL-quoted JSON holding the actual stream URLs.
2124 params_raw = compat_urllib_parse.unquote(data['params'])
2125 params = json.loads(params_raw)
# Prefer the HD stream; fall back to SD (fallback branch lines partially
# missing from this dump).
2126 video_url = params['hd_src']
2128 video_url = params['sd_src']
2130 raise ExtractorError(u'Cannot find video URL')
2131 video_duration = int(params['video_duration'])
2133 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2135 raise ExtractorError(u'Cannot find title in webpage')
2136 video_title = unescapeHTML(m.group(1))
# Returned info dictionary (surrounding literal lines partially missing).
2140 'title': video_title,
2143 'duration': video_duration,
2144 'thumbnail': params['thumbnail_src'],
2149 class BlipTVIE(InfoExtractor):
2150 """Information extractor for blip.tv"""
2152 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2153 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2154 IE_NAME = u'blip.tv'
2156 def report_extraction(self, file_id):
2157 """Report information extraction."""
2158 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2160 def report_direct_download(self, title):
2161 """Report information extraction."""
2162 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2164 def _real_extract(self, url):
2165 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not visible in dump).
2167 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file id;
# resolve it and recurse with the canonical URL.
2170 urlp = compat_urllib_parse_urlparse(url)
2171 if urlp.path.startswith('/play/'):
2172 request = compat_urllib_request.Request(url)
2173 response = compat_urllib_request.urlopen(request)
2174 redirecturl = response.geturl()
2175 rurlp = compat_urllib_parse_urlparse(redirecturl)
2176 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2177 url = 'http://blip.tv/a/a-' + file_id
2178 return self._real_extract(url)
# Ask for the JSON representation of the page; cchar (? or &) is chosen
# on a line not visible in this dump.
2185 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2186 request = compat_urllib_request.Request(json_url)
# blip.tv serves different content depending on User-Agent; iTunes's UA
# is used deliberately here (and echoed in the info dict below).
2187 request.add_header('User-Agent', 'iTunes/10.6.1')
2188 self.report_extraction(mobj.group(1))
2191 urlh = compat_urllib_request.urlopen(request)
2192 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2193 basename = url.split('/')[-1]
2194 title,ext = os.path.splitext(basename)
2195 title = title.decode('UTF-8')
2196 ext = ext.replace('.', '')
2197 self.report_direct_download(title)
# Direct-download info dict (surrounding literal lines missing from dump).
2202 'upload_date': None,
2207 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2208 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2209 if info is None: # Regular URL
2211 json_code_bytes = urlh.read()
2212 json_code = json_code_bytes.decode('utf-8')
2213 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2214 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2218 json_data = json.loads(json_code)
# The JSON may wrap the record in a 'Post' key or be the record itself.
2219 if 'Post' in json_data:
2220 data = json_data['Post']
# blip.tv dates look like '11-07-12 07:55PM'; normalized to YYYYMMDD.
2224 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2225 video_url = data['media']['url']
2226 umobj = re.match(self._URL_EXT, video_url)
2228 raise ValueError('Can not determine filename extension')
2229 ext = umobj.group(1)
# Info dict for the regular (JSON) path; some literal lines missing.
2232 'id': data['item_id'],
2234 'uploader': data['display_name'],
2235 'upload_date': upload_date,
2236 'title': data['title'],
2238 'format': data['media']['mimeType'],
2239 'thumbnail': data['thumbnailUrl'],
2240 'description': data['description'],
2241 'player_url': data['embedUrl'],
2242 'user_agent': 'iTunes/10.6.1',
2244 except (ValueError,KeyError) as err:
2245 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2251 class MyVideoIE(InfoExtractor):
2252 """Information Extractor for myvideo.de."""
2254 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2255 IE_NAME = u'myvideo'
2257 def __init__(self, downloader=None):
# Plain delegation to the base constructor.
2258 InfoExtractor.__init__(self, downloader)
2260 def report_extraction(self, video_id):
2261 """Report information extraction."""
2262 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2264 def _real_extract(self,url):
2265 mobj = re.match(self._VALID_URL, url)
# NOTE(review): 'self._download' has no trailing 'er' — looks like a typo
# for self._downloader; this line would raise AttributeError if reached.
# Confirm against the rest of the file and fix.
2267 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2270 video_id = mobj.group(1)
2273 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2274 webpage = self._download_webpage(webpage_url, video_id)
2276 self.report_extraction(video_id)
# The thumbnail link embeds the media server path; the .flv URL is built
# from it plus the video id.
2277 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2280 self._downloader.trouble(u'ERROR: unable to extract media URL')
2282 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2284 mobj = re.search('<title>([^<]+)</title>', webpage)
2286 self._downloader.trouble(u'ERROR: unable to extract title')
2289 video_title = mobj.group(1)
# Returned info dictionary (surrounding literal lines missing from dump).
2295 'upload_date': None,
2296 'title': video_title,
2300 class ComedyCentralIE(InfoExtractor):
2301 """Information extractor for The Daily Show and Colbert Report """
2303 # urls can be abbreviations like :thedailyshow or :colbert
2304 # urls for episodes like:
2305 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2306 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2307 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: matched with re.VERBOSE (see suitable/_real_extract).
2308 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2309 |(https?://)?(www\.)?
2310 (?P<showname>thedailyshow|colbertnation)\.com/
2311 (full-episodes/(?P<episode>.*)|
2313 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2314 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, lowest to highest.
2317 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2319 _video_extensions = {
2327 _video_dimensions = {
# Overrides the base-class suitable() because _VALID_URL needs re.VERBOSE.
2337 def suitable(cls, url):
2338 """Receives a URL and returns True if suitable for this IE."""
2339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2341 def report_extraction(self, episode_id):
2342 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2344 def report_config_download(self, episode_id, media_id):
2345 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2347 def report_index_download(self, episode_id):
2348 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2350 def _print_formats(self, formats):
2351 print('Available formats:')
2353 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2356 def _real_extract(self, url):
2357 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# Reached when the URL does not match (guard line not visible in dump).
2359 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames (:tds, :colbert, ...) map to the show's full-episodes page,
# then the URL is re-matched so the named groups are populated.
2362 if mobj.group('shortname'):
2363 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2364 url = u'http://www.thedailyshow.com/full-episodes/'
2366 url = u'http://www.colbertnation.com/full-episodes/'
2367 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2368 assert mobj is not None
2370 if mobj.group('clip'):
2371 if mobj.group('showname') == 'thedailyshow':
2372 epTitle = mobj.group('tdstitle')
2374 epTitle = mobj.group('cntitle')
2377 dlNewest = not mobj.group('episode')
2379 epTitle = mobj.group('showname')
2381 epTitle = mobj.group('episode')
2383 req = compat_urllib_request.Request(url)
2384 self.report_extraction(epTitle)
2386 htmlHandle = compat_urllib_request.urlopen(req)
2387 html = htmlHandle.read()
2388 webpage = html.decode('utf-8')
2389 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2390 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Follow the redirect: the landing page for 'newest' resolves to a
# concrete episode URL, which must re-match _VALID_URL.
2393 url = htmlHandle.geturl()
2394 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2396 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2398 if mobj.group('episode') == '':
2399 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2401 epTitle = mobj.group('episode')
# NOTE(review): 'media.mtvnservices.com' has unescaped dots in this
# pattern (matches any char) — harmless in practice but worth tightening.
2403 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2405 if len(mMovieParams) == 0:
2406 # The Colbert Report embeds the information in a without
2407 # a URL prefix; so extract the alternate reference
2408 # and then add the URL prefix manually.
2410 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2411 if len(altMovieParams) == 0:
2412 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2415 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2417 uri = mMovieParams[0][1]
# The MRSS index lists one <item> per part of the episode.
2418 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2419 self.report_index_download(epTitle)
2421 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2422 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2423 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2428 idoc = xml.etree.ElementTree.fromstring(indexXml)
2429 itemEls = idoc.findall('.//item')
2430 for partNum,itemEl in enumerate(itemEls):
# guid looks like 'mgid:...:<showId>.com:<shortMediaId>'.
2431 mediaId = itemEl.findall('./guid')[0].text
2432 shortMediaId = mediaId.split(':')[-1]
2433 showId = mediaId.split(':')[-2].replace('.com', '')
2434 officialTitle = itemEl.findall('./title')[0].text
2435 officialDate = itemEl.findall('./pubDate')[0].text
# Per-part config XML listing the available renditions (bitrate, url).
2437 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2438 compat_urllib_parse.urlencode({'uri': mediaId}))
2439 configReq = compat_urllib_request.Request(configUrl)
2440 self.report_config_download(epTitle, shortMediaId)
2442 configXml = compat_urllib_request.urlopen(configReq).read()
2443 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2444 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2447 cdoc = xml.etree.ElementTree.fromstring(configXml)
2449 for rendition in cdoc.findall('.//rendition'):
2450 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2454 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2457 if self._downloader.params.get('listformats', None):
2458 self._print_formats([i[0] for i in turls])
2461 # For now, just pick the highest bitrate
2462 format,rtmp_video_url = turls[-1]
2464 # Get the format arg from the arg stream
2465 req_format = self._downloader.params.get('format', None)
2467 # Select format if we can find one
2470 format, rtmp_video_url = f, v
# Transform the rtmp URL into the equivalent plain-HTTP mirror.
2473 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2475 raise ExtractorError(u'Cannot transform RTMP url')
2476 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2477 video_url = base + m.group('finalid')
# Multi-part episodes get ' part N' suffixed titles; one info dict per part.
2479 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2484 'upload_date': officialDate,
2489 'description': officialTitle,
2491 results.append(info)
2496 class EscapistIE(InfoExtractor):
2497 """Information extractor for The Escapist """
2499 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2500 IE_NAME = u'escapist'
2502 def report_extraction(self, showName):
2503 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2505 def report_config_download(self, showName):
2506 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2508 def _real_extract(self, url):
2509 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not visible in dump).
2511 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2513 showName = mobj.group('showname')
2514 videoId = mobj.group('episode')
2516 self.report_extraction(showName)
2518 webPage = compat_urllib_request.urlopen(url)
2519 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type, utf-8 otherwise.
2520 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2521 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2522 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2523 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Metadata comes from <meta> tags; the player config URL is embedded in
# the og:video URL's 'config=' query parameter.
2526 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2527 description = unescapeHTML(descMatch.group(1))
2528 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2529 imgUrl = unescapeHTML(imgMatch.group(1))
2530 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2531 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2532 configUrlMatch = re.search('config=(.*)$', playerUrl)
2533 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2535 self.report_config_download(showName)
2537 configJSON = compat_urllib_request.urlopen(configUrl)
2538 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2539 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2540 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2541 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2544 # Technically, it's JavaScript, not JSON
# Crude single→double quote swap to make the JS object parseable as JSON;
# would break on strings containing apostrophes.
2545 configJSON = configJSON.replace("'", '"')
2548 config = json.loads(configJSON)
2549 except (ValueError,) as err:
2550 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2553 playlist = config['playlist']
# The actual video is the second playlist entry.
2554 videoUrl = playlist[1]['url']
# Returned info dictionary (surrounding literal lines missing from dump).
2559 'uploader': showName,
2560 'upload_date': None,
2563 'thumbnail': imgUrl,
2564 'description': description,
2565 'player_url': playerUrl,
2570 class CollegeHumorIE(InfoExtractor):
2571 """Information extractor for collegehumor.com"""
2574 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2575 IE_NAME = u'collegehumor'
2577 def report_manifest(self, video_id):
2578 """Report information extraction."""
2579 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2581 def report_extraction(self, video_id):
2582 """Report information extraction."""
2583 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2585 def _real_extract(self, url):
2586 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not visible in dump).
2588 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2590 video_id = mobj.group('videoid')
# info dict is built incrementally; its initial literal lines are
# missing from this dump.
2595 'upload_date': None,
2598 self.report_extraction(video_id)
# Step 1: metadata XML gives title/description/thumbnail and the f4m
# manifest URL.
2599 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2601 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2602 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2603 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2606 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2608 videoNode = mdoc.findall('./video')[0]
2609 info['description'] = videoNode.findall('./description')[0].text
2610 info['title'] = videoNode.findall('./caption')[0].text
2611 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2612 manifest_url = videoNode.findall('./file')[0].text
2614 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: the Adobe HDS manifest yields media/id nodes from which the
# final segment URL is assembled.
2617 manifest_url += '?hdcore=2.10.3'
2618 self.report_manifest(video_id)
2620 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2621 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2622 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2625 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2627 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2628 node_id = media_node.attrib['url']
2629 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2630 except IndexError as err:
2631 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2634 url_pr = compat_urllib_parse_urlparse(manifest_url)
2635 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2642 class XVideosIE(InfoExtractor):
2643 """Information extractor for xvideos.com"""
2645 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2646 IE_NAME = u'xvideos'
2648 def report_extraction(self, video_id):
2649 """Report information extraction."""
2650 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2652 def _real_extract(self, url):
2653 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not visible in dump).
2655 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2657 video_id = mobj.group(1)
2659 webpage = self._download_webpage(url, video_id)
2661 self.report_extraction(video_id)
# The flash player receives the (url-quoted) media URL via flv_url=.
2665 mobj = re.search(r'flv_url=(.+?)&', webpage)
2667 self._downloader.trouble(u'ERROR: unable to extract video url')
2669 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text up to the ' - XVID' suffix.
2673 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2675 self._downloader.trouble(u'ERROR: unable to extract video title')
2677 video_title = mobj.group(1)
2680 # Extract video thumbnail
# group(0): the whole matched thumbnail URL is used, not a capture group.
2681 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2683 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2685 video_thumbnail = mobj.group(0)
# Returned info dictionary (surrounding literal lines missing from dump).
2691 'upload_date': None,
2692 'title': video_title,
2694 'thumbnail': video_thumbnail,
2695 'description': None,
2701 class SoundcloudIE(InfoExtractor):
2702 """Information extractor for soundcloud.com
2703 To access the media, the uid of the song and a stream token
2704 must be extracted from the page source and the script must make
2705 a request to media.soundcloud.com/crossdomain.xml. Then
2706 the media can be grabbed by requesting from an url composed
2707 of the stream token and uid
2710 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2711 IE_NAME = u'soundcloud'
2713 def __init__(self, downloader=None):
# Plain delegation to the base constructor.
2714 InfoExtractor.__init__(self, downloader)
2716 def report_resolve(self, video_id):
2717 """Report information extraction."""
2718 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2720 def report_extraction(self, video_id):
2721 """Report information extraction."""
2722 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2724 def _real_extract(self, url):
2725 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not visible in dump).
2727 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2730 # extract uploader (which is in the url)
2731 uploader = mobj.group(1)
2732 # extract simple title (uploader + slug of song title)
2733 slug_title = mobj.group(2)
2734 simple_title = uploader + u'-' + slug_title
2736 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the human-readable URL to the track's numeric id via
# the public resolve API (client_id is the app key hard-coded here).
2738 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2739 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2740 request = compat_urllib_request.Request(resolv_url)
2742 info_json_bytes = compat_urllib_request.urlopen(request).read()
2743 info_json = info_json_bytes.decode('utf-8')
2744 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2745 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2748 info = json.loads(info_json)
2749 video_id = info['id']
2750 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: fetch the per-track stream list and pick the 128k MP3 HTTP URL.
2752 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2753 request = compat_urllib_request.Request(streams_url)
2755 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2756 stream_json = stream_json_bytes.decode('utf-8')
2757 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2758 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2761 streams = json.loads(stream_json)
2762 mediaURL = streams['http_mp3_128_url']
# Returned info dictionary (surrounding literal lines missing from dump).
# NOTE(review): 'created_at' is passed through unformatted — the base
# class documents upload_date as YYYYMMDD; confirm.
2767 'uploader': info['user']['username'],
2768 'upload_date': info['created_at'],
2769 'title': info['title'],
2771 'description': info['description'],
2775 class InfoQIE(InfoExtractor):
2776 """Information extractor for infoq.com"""
2777 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2779 def report_extraction(self, video_id):
2780 """Report information extraction."""
2781 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2783 def _real_extract(self, url):
2784 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not visible in dump).
2786 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The full URL doubles as the id for progress reporting.
2789 webpage = self._download_webpage(url, video_id=url)
2790 self.report_extraction(url)
# The real media path is base64-encoded in the jsclassref attribute,
# then url-unquoted and appended to the rtmpe base.
2793 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2795 self._downloader.trouble(u'ERROR: unable to extract video url')
2797 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2798 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2801 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2803 self._downloader.trouble(u'ERROR: unable to extract video title')
2805 video_title = mobj.group(1)
2807 # Extract description
2808 video_description = u'No description available.'
2809 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2810 if mobj is not None:
2811 video_description = mobj.group(1)
# The media filename's stem/extension become the id and ext.
2813 video_filename = video_url.split('/')[-1]
2814 video_id, extension = video_filename.split('.')
# Returned info dictionary (surrounding literal lines missing from dump).
2820 'upload_date': None,
2821 'title': video_title,
2822 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2824 'description': video_description,
2829 class MixcloudIE(InfoExtractor):
2830 """Information extractor for www.mixcloud.com"""
# Marked broken: the base class docs say _WORKING=False warns users and
# skips tests.
2832 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2833 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2834 IE_NAME = u'mixcloud'
2836 def __init__(self, downloader=None):
# Plain delegation to the base constructor.
2837 InfoExtractor.__init__(self, downloader)
2839 def report_download_json(self, file_id):
2840 """Report JSON download."""
2841 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2843 def report_extraction(self, file_id):
2844 """Report information extraction."""
2845 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2847 def get_urls(self, jsonData, fmt, bitrate='best'):
2848 """Get urls from 'audio_formats' section in json"""
2851 bitrate_list = jsonData[fmt]
# 'best' (or an unavailable bitrate) falls back to the highest offered.
2852 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2853 bitrate = max(bitrate_list) # select highest
2855 url_list = jsonData[fmt][bitrate]
2856 except TypeError: # we have no bitrate info.
2857 url_list = jsonData[fmt]
2860 def check_urls(self, url_list):
2861 """Returns 1st active url from list"""
# Probes each candidate with a GET; the first that opens without error
# wins (the return lines are not visible in this dump).
2862 for url in url_list:
2864 compat_urllib_request.urlopen(url)
2866 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2871 def _print_formats(self, formats):
2872 print('Available formats:')
2873 for fmt in formats.keys():
2874 for b in formats[fmt]:
2876 ext = formats[fmt][b][0]
2877 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2878 except TypeError: # we have no bitrate info
2879 ext = formats[fmt][0]
2880 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2883 def _real_extract(self, url):
2884 mobj = re.match(self._VALID_URL, url)
# Reached when the URL does not match (guard line not visible in dump).
2886 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2888 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on the str results of mobj.group() is
# Python-2-only; on Python 3 these lines raise AttributeError — likely
# part of why _WORKING is False. Confirm before relying on this IE.
2889 uploader = mobj.group(1).decode('utf-8')
2890 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2892 # construct API request
2893 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2894 # retrieve .json file with links to files
2895 request = compat_urllib_request.Request(file_url)
2897 self.report_download_json(file_url)
2898 jsonData = compat_urllib_request.urlopen(request).read()
2899 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2900 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2904 json_data = json.loads(jsonData)
2905 player_url = json_data['player_swf_url']
2906 formats = dict(json_data['audio_formats'])
2908 req_format = self._downloader.params.get('format', None)
2911 if self._downloader.params.get('listformats', None):
2912 self._print_formats(formats)
# No explicit format requested: take the first format whose URL probe
# succeeds; otherwise honor the requested format if available.
2915 if req_format is None or req_format == 'best':
2916 for format_param in formats.keys():
2917 url_list = self.get_urls(formats, format_param)
2919 file_url = self.check_urls(url_list)
2920 if file_url is not None:
2923 if req_format not in formats:
2924 self._downloader.trouble(u'ERROR: format is not available')
2927 url_list = self.get_urls(formats, req_format)
2928 file_url = self.check_urls(url_list)
2929 format_param = req_format
# Returned info dictionary (surrounding literal lines missing from dump);
# the same py2-only .decode pattern appears throughout.
2932 'id': file_id.decode('utf-8'),
2933 'url': file_url.decode('utf-8'),
2934 'uploader': uploader.decode('utf-8'),
2935 'upload_date': None,
2936 'title': json_data['name'],
2937 'ext': file_url.split('.')[-1].decode('utf-8'),
2938 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2939 'thumbnail': json_data['thumbnail_url'],
2940 'description': json_data['description'],
2941 'player_url': player_url.decode('utf-8'),
2944 class StanfordOpenClassroomIE(InfoExtractor):
2945 """Information extractor for Stanford's Open ClassRoom"""
2947 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2948 IE_NAME = u'stanfordoc'
2950 def report_download_webpage(self, objid):
2951 """Report information extraction."""
# Progress message only (docstring text is copy-pasted from siblings).
2952 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2954 def report_extraction(self, video_id):
2955 """Report information extraction."""
# Progress message only; writes to the downloader's screen output.
2956 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2958 def _real_extract(self, url):
2959 mobj = re.match(self._VALID_URL, url)
2961 raise ExtractorError(u'Invalid URL: %s' % url)
2963 if mobj.group('course') and mobj.group('video'): # A specific video
2964 course = mobj.group('course')
2965 video = mobj.group('video')
2967 'id': course + '_' + video,
2969 'upload_date': None,
2972 self.report_extraction(info['id'])
2973 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2974 xmlUrl = baseUrl + video + '.xml'
2976 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2977 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2978 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2980 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2982 info['title'] = mdoc.findall('./title')[0].text
2983 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2985 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2987 info['ext'] = info['url'].rpartition('.')[2]
2989 elif mobj.group('course'): # A course page
2990 course = mobj.group('course')
2995 'upload_date': None,
2998 coursepage = self._download_webpage(url, info['id'],
2999 note='Downloading course info page',
3000 errnote='Unable to download course info page')
3002 m = re.search('<h1>([^<]+)</h1>', coursepage)
3004 info['title'] = unescapeHTML(m.group(1))
3006 info['title'] = info['id']
3008 m = re.search('<description>([^<]+)</description>', coursepage)
3010 info['description'] = unescapeHTML(m.group(1))
3012 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3015 'type': 'reference',
3016 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3020 for entry in info['list']:
3021 assert entry['type'] == 'reference'
3022 results += self.extract(entry['url'])
3026 'id': 'Stanford OpenClassroom',
3029 'upload_date': None,
3032 self.report_download_webpage(info['id'])
3033 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3035 rootpage = compat_urllib_request.urlopen(rootURL).read()
3036 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3037 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3040 info['title'] = info['id']
3042 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3045 'type': 'reference',
3046 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3051 for entry in info['list']:
3052 assert entry['type'] == 'reference'
3053 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    NOTE(review): the embedded numbering jumps in this view -- None guards,
    "return" statements, "try:" headers and the result dict opener are
    apparently elided; "[elided: ...]" markers below flag the gaps.
    """
    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # The protocol may be absent; normalize before downloading.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to extract song name')
        # NOTE(review): .decode('iso-8859-1') only exists on Python 2 byte
        # strings; on Python 3 a str has no .decode and this would raise
        # AttributeError -- confirm which interpreters are still supported.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # [elided: "try:"]
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # [elided: "try:" -- rendition attributes may be missing]
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        # [elided: "except KeyError:"]
        self._downloader.trouble('Invalid rendition field.')

        # [elided: "info = {" opener with 'id'/'url'/'ext' entries]
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos in numbered segments; each segment URL is derived
    from an obfuscated "fileid" plus a per-segment key from the playlist
    JSON.

    NOTE(review): the embedded numbering jumps in this view -- a def
    header, guards, "try:" headers and dict openers are apparently elided;
    "[elided: ...]" markers below flag the gaps.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    # [elided: "def _gen_sid(self):" header -- builds a pseudo-random
    #  session id from a millisecond timestamp plus two random ints]
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet with Youku's seeded LCG;
        # the result is the lookup table for decoding the obfuscated fileid.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # [elided: "mixed = []" initialisation]
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        # [elided: "return mixed"]

    def _get_file_id(self, fileId, seed):
        # Decode the '*'-separated digit string through the shuffled table.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # [elided: "realId = []" and the "for ch in ids:" loop header]
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        # [elided: "try:"]
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        # [elided: "try:" wrapping the JSON parsing below]
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        # Map the requested generic quality onto Youku's format names.
        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        if format is None or format == 'best':
            if 'hd2' in supported_format:
            # [elided: quality-selection assignments and remaining branches]
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')

        # [elided: "files_info=[]" accumulator]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Patch the hex segment number into positions 8-9 of the fileid.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # [elided: "info = {" opener]
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            # [elided: 'uploader' entry]
            'upload_date': None,
            'title': video_title,
            # [elided: 'ext'/'format' entries and dict close]
            files_info.append(info)
        # [elided: "return files_info"]
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    NOTE(review): the embedded numbering jumps in this view -- None guards,
    "return"s, the "try:" header and the result dict opener are apparently
    elided; "[elided: ...]" markers below flag the gaps.
    """
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # Page-scraping patterns: flash video URL, page title, thumbnail.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        # [elided: "try:"]
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        result = re.search(self.VIDEO_URL_RE, webpage)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to extract video url')
        # The flv URL is percent-encoded inside the page.
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # [elided: "return [{" opener with 'id'/'url'/'uploader' entries]
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com.

    NOTE(review): the embedded numbering jumps in this view -- None guards,
    default assignments and "try:" headers are apparently elided;
    "[elided: ...]" markers below flag the gaps.
    """
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        # [elided: "try:"]
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        # [elided: default assignment and "if mobj:" guard around the block]
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # [elided: uploader default assignment and "if mobj:" guard]
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        # [elided: title default assignment and "if mobj:" guard]
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        # [elided: "try:"]
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        # [elided: empty-result guard]
            self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # [elided: "try:"]
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # [elided: "return [{" opener with 'id'/'url' entries]
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com / www.nba.com video pages.

    The downloadable MP4 lives at a predictable CDN location derived from
    the URL path, so only the metadata is scraped from the page.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The 720p MP4 is always at this CDN path for a given video path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Scrape a single property from the page; fall back to default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: this key was misspelled 'uploader_date'; the field
            # the FileDownloader consumes (see the class docstring at the
            # top of this file) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    # NOTE(review): the embedded numbering jumps in this view -- guards,
    # loop headers, the dict opener and some returns are apparently
    # elided; "[elided: ...]" markers below flag the gaps.

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        # [elided: "try:"]
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        # The API signals errors with a dict instead of a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        # [elided: "info = []" accumulator]
        for clip in response:
            video_url = clip['video_file_url']
            # [elided: guard skipping clips without a video URL]
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time begins with an ISO date; keep only YYYYMMDD digits.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # [elided: "info.append({" opener with 'id'/'url' entries]
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # [elided: "paged" flag initialisation]
        if mobj.lastindex == 1:
            # Whole channel: page through the archives API.
            api += '/channel/archives/%s.json'
        # [elided: "else:" branch -- single broadcast]
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        # [elided: "info = []", "offset = 0" initialisation and the
        #  "while True:" / "if paged:" headers]
        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the final page.
            if not paged or page_count != limit:
            # [elided: "break", "offset += limit" and the final "return info"]
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages.

    NOTE(review): the embedded numbering jumps in this view -- None guards,
    "return"s and the result dict are apparently elided; "[elided: ...]"
    markers below flag the gaps.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The second <source> inside <video> carries the direct file URL.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        # [elided: None guard]
            self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        # [elided: None guard]
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        # [elided: "if m:" guard -- the description is optional]
        desc = unescapeHTML(m.group('desc'))

        # [elided: "info = {" opener with 'id'/'url'/'ext'/'title' entries]
        'description': desc,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers.

    A game's /video/<gameID>/ page lists every trailer; all of them are
    returned, in page order.
    """
    # The pattern is written in verbose mode, so suitable() and
    # _real_extract() must pass re.VERBOSE explicitly.
    # FIX: the pattern was unterminated and lacked the 'gameID' group that
    # _real_extract() reads via m.group('gameID'); both are restored.
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    # FIX: suitable() takes cls and is invoked on the class (see the base
    # class in this file), so it must be a classmethod.
    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the JS movie table: FILENAME plus optional MOVIE_NAME.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movies, titles and thumbnails appear in parallel order on the page.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            })
        return videos
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos.

    NOTE(review): the embedded numbering jumps -- the result dict opener
    and the final return are apparently elided from this view.
    """
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The recorded file is served from a predictable CDN path.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # NOTE(review): both searches below assume a match; a page layout
        # change would raise AttributeError on .group().
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        # [elided: "info = {" opener with 'id'/'url'/'ext'/'title' entries]
        'uploader': uploader
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows.

    NOTE(review): the embedded numbering jumps -- a guard, a "try:" header,
    the result dict opener and the final return are apparently elided from
    this view.
    """
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as JSON in an inline <script>.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        # [elided: "if not m:" guard]
        raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        # [elided: "try:"]
        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream variant explicitly.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # [elided: "info = {" opener with 'id'/'url'/'ext' entries]
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    # NOTE(review): the embedded numbering jumps throughout -- guards, loop
    # headers, the per-format dict opener and several returns are
    # apparently elided from this view; "[elided: ...]" markers flag gaps.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the entry whose 'format' equals req_format.
        # [elided: "for x in formats:" loop header]
            if(x["format"]==req_format):
        # [elided: "return x" and the fall-through "return None"]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        # [elided: None guard]
        raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        # [elided: None guard / "else:" split between these branches]
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        # [elided: None guard / "else:" split between these branches]
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        # [elided: None guard]
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        # [elided: "formats = []" and the "for link in links:" loop header]

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            format = path.split('/')[4].split('_')[:2]
            # [elided: "size"/"bitrate" assignments used in the title below]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            # [elided: "formats.append({" opener with 'id'/'url' entries]
            'uploader': video_uploader,
            'upload_date': upload_date,
            # [elided: 'title'/'ext'/'format' entries]
            'description': None,
            # [elided: dict close / append]

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # [elided: "return"]

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # Formats are page-ordered best-first, so [0]=best, [-1]=worst.
        if req_format is None or req_format == 'best':
            # [elided: "return [formats[0]]"]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            # [elided: "return formats"]
        # [elided: "else:" -- a single, specific format was requested]
            format = self._specific( req_format, formats )
            # [elided: None guard]
            self._downloader.trouble(u'ERROR: requested format not available')
            # [elided: "return [format]"]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the id and the (slugged) title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL from the player config.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: this message previously said "unable to extract video
            # title" although it is the upload date that failed to parse.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    # NOTE(review): the embedded numbering jumps -- None guards and the
    # final return are apparently elided; "[elided: ...]" markers below.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        # [elided: None guard]
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        # [elided: None guard]
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The player receives the file URL via a flashvars assignment.
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        # [elided: None guard]
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                # [elided: 'url' entry]
                'title': video_title,
                # [elided: 'ext'/'format' entries]
                'player_url': embed_page_url}
        # [elided: "return [info]"]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    A mix is a playlist of tracks; the play/next API is walked with a
    random session id until it reports the last track.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS object literal.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        # BUG FIX: mix_id was referenced below but never assigned, which
        # raised NameError; the mix id comes from the parsed page data.
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            # The API flags the final track; otherwise ask for the next one.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos.

    NOTE(review): the embedded numbering jumps -- the result dict opener
    and the final return are apparently elided from this view.
    """
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Video and thumbnail live at predictable CDN paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        # NOTE(review): both searches below assume a match; a page layout
        # change would raise AttributeError on .group().
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
        # [elided: "info = {" opener with 'id'/'url'/'ext'/'title' entries]
        'thumbnail': thumbnail,
        'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    NOTE(review): the embedded numbering jumps in this view -- parts of the
    verbose regex literals (including their closing quotes), the
    @classmethod decorator, "info=[]" and some returns are apparently
    elided; "[elided: ...]" markers flag the gaps outside string literals.
    """
    _VALID_URL=r'''http://www.ted.com/
                   ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                   ((?P<type_talk>talks)) # We have a simple talk
                   /(?P<name>\w+) # Here goes the name and then ".html"
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        # [elided: "else:" branch header -- playlist handling]
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # [elided: "video_RE=r'''" opener for the verbose pattern below]
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Videos and their names appear in parallel order on the page.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # [elided: "info=[]" accumulator]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        # [elided: "return info"]

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                    "id":(?P<videoID>[\d]+).*?
                    "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # [elided: "info = {" opener with 'id'/'url'/'ext'/'title' entries]
        'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Extractor for videos hosted on myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the video metadata XML for the id embedded in the URL and
        return a single-element list with the video's info dict."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            # trailing slash: last path element was empty, use the parent's
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # no explicit format id in the metadata: fall back to the
            # file extension of the download URL
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional fields
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Extractor for video pages on spiegel.de."""
    # The '#fragment' part must be optional — '(?:#.*)?$' — otherwise plain
    # '...-12345.html' URLs (no fragment) would never match.
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a single-element list with the video's info dict."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # A per-video XML document lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry is used — presumably the highest quality;
        # NOTE(review): confirm ordering against the live XML.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
4133 def gen_extractors():
4134 """ Return a list of an instance of every supported extractor.
4135 The order does matter; the first extractor matched is the one handling the URL.
4138 YoutubePlaylistIE(),
4162 StanfordOpenClassroomIE(),