2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title: Video title, unescaped.
    ext: Video filename extension.

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    uploader: Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).
    uploader_id: Nickname or id of the video uploader.
    location: Physical location of the video.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The .srt file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): this listing is non-contiguous; at least one
        # initialization statement (presumably a lazy-init flag) between the
        # docstring and the call below is missing from this view -- confirm
        # against the full file.
        self.set_downloader(downloader)
77 def suitable(self, url):
78 """Receives a URL and returns True if suitable for this IE."""
79 return re.match(self._VALID_URL, url) is not None
    # NOTE(review): this listing is non-contiguous in the region below; the
    # `def` lines for the working() and initialize() accessors are missing
    # above their docstrings, as are parts of their bodies.
    """Getter method for _WORKING."""

    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a lazy-initialization call preceding this return
        # appears to be missing from this listing.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): orphan line from a missing IE_NAME property definition;
    # it derives the IE name by dropping the trailing two characters ("IE",
    # per the subclass naming convention visible in this file).
    return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): non-contiguous listing -- the `if note is None:` guard,
        # the `try:` line and the `if errnote is None:` guard are missing here.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise with the original traceback so the failure site is kept.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
124 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
125 """ Returns the data of the page as a string """
126 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
127 webpage_bytes = urlh.read()
128 return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r"""^...` opening line of this verbose
    # regex is missing from the non-contiguous listing; the pattern body below
    # is reproduced verbatim (a `#` line inside a VERBOSE pattern is ignored).
    (?:https?://)? # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
    (?:.*?\#/)? # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
    (?: # the various things that can precede the ID:
    (?:(?:v|embed|e)/) # v/ or embed/ or e/
    |(?: # or the v= param in all its forms
    (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
    (?:\?|\#!?) # the params delimiter ? or # or #!
    (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
    # NOTE(review): the `v=` capture-group lines are missing from this listing.
    )? # optional -> youtube.com/xxxx is OK
    )? # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
    (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension map; most entries are missing from this listing.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # NOTE(review): itag -> dimensions map; its entries and both dicts'
    # closing braces are missing from this listing.
    _video_dimensions = {
192 def suitable(self, url):
193 """Receives a URL and returns True if suitable for this IE."""
194 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    # --- user-feedback helpers: thin wrappers over FileDownloader.to_screen ---

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # NOTE(review): original docstring repeated the "info webpage" text;
        # corrected to match the message actually printed below.
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # NOTE(review): original docstring said "Report extracted video URL",
        # which does not match the message printed below.
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
232 def _closed_captions_xml_to_srt(self, xml_string):
234 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
235 # TODO parse xml instead of regex
236 for n, (start, dur_tag, dur, caption) in enumerate(texts):
237 if not dur: dur = '4'
239 end = start + float(dur)
240 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
241 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
242 caption = unescapeHTML(caption)
243 caption = unescapeHTML(caption) # double cycle, intentional
244 srt += str(n+1) + '\n'
245 srt += start + ' --> ' + end + '\n'
246 srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions; returns a (warning_or_None, srt_or_None) pair."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the `try:` opening this block is missing from the listing.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): the `srt_lang = 'en'` assignment and the `else:`
            # line are missing from this listing; the line below is the
            # fallback ("pick any language") branch.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            # NOTE(review): the 'lang', 'v' and 'fmt' entries and the closing
            # brace of this dict are missing from this listing.
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): `try:` line missing here.
        srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): the `if not srt_xml:` guard for the return below is
        # missing from this listing.
        return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one line per format: itag, container extension, dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop line is missing here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        # NOTE(review): the source listing is non-contiguous throughout this
        # method; several control-flow lines (try:, else:, return, the
        # credential assignments and two dict openers) are missing.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best effort; failures only warn)
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Hidden anti-forgery tokens scraped from the login form
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): the `login_form_strs = {` opener and many entries of
        # this form dict are missing from the listing.
        u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age; the `age_form = {` opener is missing from this listing.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
391 def _extract_id(self, url):
392 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
394 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
396 video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract metadata and per-format download URLs for a YouTube video."""
        # NOTE(review): the source listing is non-contiguous; `try:` lines,
        # None-guards and `return` statements are missing at several points.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Unescape the JSON-escaped slashes in the SWF URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: retry with several `el` values until a token appears
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the `else:` line for the branch below is missing.
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best effort, scraped from the page markup)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): the `else:` line for the fallback below is missing.
        video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): the `else:` line for the fallback below is missing.
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                # NOTE(review): the `if rf in url_map:` guard and the `break`
                # after the assignment below are missing from this listing.
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
        # NOTE(review): the `else:` line for the branch below is missing.
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` opener (and the 'id' entry)
            # for the dict below is missing from this listing.
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): the `try:` line is missing from this listing.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age.
        # NOTE(review): the `disclaimer_form = {` opener is missing here.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): the `try:` line is missing from this listing.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract media URL, title and uploader from a metacafe watch page."""
        # NOTE(review): non-contiguous listing; several None-guards, `try:`
        # lines and `return` statements are missing below.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate "yt-" prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob instead
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener for the result dict below is
        # missing from this listing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the best-quality media URL, title, uploader and date."""
        # NOTE(review): non-contiguous listing; None-guards, loop bodies and
        # `return` statements are missing at several points below.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; the body that checks each key
        # and breaks out of the loop is missing from this listing.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)

        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # looking for the official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)  # DD-MM-YYYY -> YYYYMMDD

        # NOTE(review): the `return [{` opener and the 'id'/'url' entries of
        # the result dict are missing from this listing.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from a photobucket page."""
        # NOTE(review): non-contiguous listing; None-guards, `try:` lines and
        # `return` statements are missing below.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # NOTE(review): the `video_url = mediaURL` assignment is missing here,
        # although `video_url` is referenced in the result dict below.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opener for the result dict below is
        # missing from this listing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata; rewrites non-/watch/ URLs first.

        new_video: internal recursion flag -- False on the second pass after
        the URL has been rewritten to the canonical /watch/ form.
        """
        # NOTE(review): non-contiguous listing; None-guards, `try:` lines and
        # `return` statements are missing at several points below.
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the `return [{` opener and the 'url' entry of the
        # result dict below are missing from this listing.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs: optional scheme, www/player hosts,
    # group/album prefixes, and player redirect links.
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force https when no scheme was given, and
        # resolve player redirect links to the canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag.
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    # No labelled quality: take the first advertised one.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that has at least one entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map the groups listed in
        *matchTuples* (index, key, error message) into a dict."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Language is the 4th-from-last path component of the live URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # NOTE(review): third group reconstructed as an rtmp URL — confirm
        # against the original regex.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # No return value: live streams are only resolved, not downloaded.

    def extractPlus7Stream(self, url):
        # Language is the 3rd-from-last path component of a +7 URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        # NOTE(review): 'ext'/'format'/'player_url' reconstructed — confirm
        # against the original tail of this dict.
        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers that no longer apply to the
                    # redirected HEAD request.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # ytsearch:<q>, ytsearchN:<q>, or ytsearchall:<q>
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the leading 'ytsearch' to leave '', 'all', or a number.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' shrinks to the API's totalItems once known.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # gvsearch:<q>, gvsearchN:<q>, or gvsearchall:<q>
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the leading 'gvsearch' to leave '', 'all', or a number.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next" link: last results page reached.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # yvsearch:<q>, yvsearchN:<q>, or yvsearchall:<q>
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        # Strip the leading 'yvsearch' to leave '', 'all', or a number.
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: last results page reached.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): the interior of this verbose regex was partially lost in
    # transit; it is reconstructed so that group(2) is the playlist id and
    # group(3) is an optional single-video id, matching the uses in
    # _real_extract below — confirm against upstream history.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (
                           (?:course|view_play_list|my_playlists|artist|playlist)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ([0-9A-Za-z-_]{10,})
                        (?:/.*?/([0-9A-Za-z_-]+))?
                        .*"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # URL points at a single video inside the playlist: hand it off.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist videos from API
        playlist_id = mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
                return

            # Keep (position, url) pairs so the playlist order survives paging.
            videos += [(entry['yt$position']['$t'], entry['content']['src']) for entry in response['feed']['entry']]

            # A short page means we have reached the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = map(operator.itemgetter(1), sorted(videos))

        total = len(videos)

        # Apply --playlist-start / --playlist-end (1-based, -1 = no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # The "Next »" pager link; absent on the last page.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # No pager link means this was the last page.
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end (1-based, -1 = no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # blip.tv's episode-list AJAX endpoint returns 12 entries per page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The numeric users_id needed by the AJAX endpoint is embedded
            # in the user's HTML page.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end (1-based, -1 = no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/return/dict braces). Kept byte-for-byte.
# Scrapes a depositfiles.com file page (forced to the English locale) with
# the 'Free download' form submitted, then extracts the real file URL and
# the file title from the resulting HTML.
1983 class DepositFilesIE(InfoExtractor):
1984 """Information extractor for depositfiles.com"""
1986 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1988 def report_download_webpage(self, file_id):
1989 """Report webpage download."""
1990 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1992 def report_extraction(self, file_id):
1993 """Report information extraction."""
1994 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1996 def _real_extract(self, url):
1997 file_id = url.split('/')[-1]
1998 # Rebuild url in english locale
1999 url = 'http://depositfiles.com/en/files/' + file_id
2001 # Retrieve file webpage with 'Free download' button pressed
2002 free_download_indication = { 'gateway_result' : '1' }
# POSTing gateway_result=1 emulates pressing the free-download button.
2003 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2005 self.report_download_webpage(file_id)
2006 webpage = compat_urllib_request.urlopen(request).read()
2007 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2008 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2011 # Search for the real file URL
2012 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2013 if (mobj is None) or (mobj.group(1) is None):
2014 # Try to figure out reason of the error.
2015 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2016 if (mobj is not None) and (mobj.group(1) is not None):
2017 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2018 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2020 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2023 file_url = mobj.group(1)
2024 file_extension = os.path.splitext(file_url)[1][1:]
2026 # Search for file title
2027 mobj = re.search(r'<b title="(.*?)">', webpage)
2029 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): .decode() on these values implies `webpage` was kept as
# bytes (py2-era code) — confirm against the compat layer before porting.
2031 file_title = mobj.group(1).decode('utf-8')
2034 'id': file_id.decode('utf-8'),
2035 'url': file_url.decode('utf-8'),
2037 'upload_date': None,
2038 'title': file_title,
2039 'ext': file_extension.decode('utf-8'),
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines. Kept byte-for-byte; comments only.
# Logs into Facebook (credentials from options or .netrc), fetches the video
# page, and pulls the hd_src/sd_src media URL out of an embedded JS blob.
2043 class FacebookIE(InfoExtractor):
2044 """Information Extractor for Facebook"""
2046 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2047 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2048 _NETRC_MACHINE = 'facebook'
2049 IE_NAME = u'facebook'
2051 def report_login(self):
2052 """Report attempt to log in."""
2053 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2055 def _real_initialize(self):
2056 if self._downloader is None:
2061 downloader_params = self._downloader.params
2063 # Attempt to use provided username and password or .netrc data
2064 if downloader_params.get('username', None) is not None:
2065 useremail = downloader_params['username']
2066 password = downloader_params['password']
2067 elif downloader_params.get('usenetrc', False):
2069 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2070 if info is not None:
2074 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2075 except (IOError, netrc.NetrcParseError) as err:
# Missing credentials are a warning, not fatal: public videos may still work.
2076 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2079 if useremail is None:
2088 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2091 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2092 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2093 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2095 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2096 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2099 def _real_extract(self, url):
2100 mobj = re.match(self._VALID_URL, url)
2102 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2104 video_id = mobj.group('ID')
2106 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2107 webpage = self._download_webpage(url, video_id)
# BEFORE/AFTER anchor the JSON array of flashvars inside the page's JS.
2109 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2110 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2111 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2113 raise ExtractorError(u'Cannot parse data')
2114 data = dict(json.loads(m.group(1)))
2115 params_raw = compat_urllib_parse.unquote(data['params'])
2116 params = json.loads(params_raw)
# Prefer the HD stream; fall back to SD (fallback branch's if/else lines
# are among the missing fused numbers 2118/2120).
2117 video_url = params['hd_src']
2119 video_url = params['sd_src']
2121 raise ExtractorError(u'Cannot find video URL')
2122 video_duration = int(params['video_duration'])
2124 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2126 raise ExtractorError(u'Cannot find title in webpage')
2127 video_title = unescapeHTML(m.group(1))
2131 'title': video_title,
2134 'duration': video_duration,
2135 'thumbnail': params['thumbnail_src'],
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/if/return/dict scaffolding). Kept byte-for-byte.
# Fetches blip.tv metadata via the site's JSON skin (skin=json&version=2);
# if the server answers with a video/* Content-Type instead, treats it as a
# direct download and derives id/title/ext from the URL basename.
2140 class BlipTVIE(InfoExtractor):
2141 """Information extractor for blip.tv"""
2143 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2144 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2145 IE_NAME = u'blip.tv'
2147 def report_extraction(self, file_id):
2148 """Report information extraction."""
2149 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2151 def report_direct_download(self, title):
2152 """Report information extraction."""
2153 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2155 def _real_extract(self, url):
2156 mobj = re.match(self._VALID_URL, url)
2158 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') is chosen on the missing lines 2160-2164.
2165 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2166 request = compat_urllib_request.Request(json_url)
# Spoofing iTunes is presumably needed to get the JSON skin — verify.
2167 request.add_header('User-Agent', 'iTunes/10.6.1')
2168 self.report_extraction(mobj.group(1))
2171 urlh = compat_urllib_request.urlopen(request)
2172 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2173 basename = url.split('/')[-1]
2174 title,ext = os.path.splitext(basename)
2175 title = title.decode('UTF-8')
2176 ext = ext.replace('.', '')
2177 self.report_direct_download(title)
2182 'upload_date': None,
2187 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2188 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2189 if info is None: # Regular URL
2191 json_code_bytes = urlh.read()
2192 json_code = json_code_bytes.decode('utf-8')
2193 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2194 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2198 json_data = json.loads(json_code)
2199 if 'Post' in json_data:
2200 data = json_data['Post']
# blip.tv datestamps look like '11-30-12 12:00AM'; normalized to YYYYMMDD.
2204 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2205 video_url = data['media']['url']
2206 umobj = re.match(self._URL_EXT, video_url)
2208 raise ValueError('Can not determine filename extension')
2209 ext = umobj.group(1)
2212 'id': data['item_id'],
2214 'uploader': data['display_name'],
2215 'upload_date': upload_date,
2216 'title': data['title'],
2218 'format': data['media']['mimeType'],
2219 'thumbnail': data['thumbnailUrl'],
2220 'description': data['description'],
2221 'player_url': data['embedUrl'],
# The same UA must be used for the media download as for the metadata call.
2222 'user_agent': 'iTunes/10.6.1',
2224 except (ValueError,KeyError) as err:
2225 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Matches watch-page URLs (http://www.myvideo.de/watch/<id>/<slug>),
    scrapes the watch page for the media base URL (taken from the
    image_src thumbnail link) and the <title> tag, and returns the
    single-entry list of info dictionaries that _real_extract
    implementations in this file are expected to produce.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.trouble(...)` — the attribute is
            # `self._downloader`; the typo raised AttributeError on an
            # invalid URL instead of reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        # The flv sits next to the thumbnail directory on the media host.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# many missing interior lines (try:/return/if-else branches, the
# _video_extensions/_video_dimensions dict bodies). Kept byte-for-byte.
# Resolves show shortcuts/episode/clip URLs to an MRSS feed, then downloads a
# per-media config XML to choose an RTMP rendition and rewrites it to an
# HTTP progressive URL.
2280 class ComedyCentralIE(InfoExtractor):
2281 """Information extractor for The Daily Show and Colbert Report """
2283 # urls can be abbreviations like :thedailyshow or :colbert
2284 # urls for episodes like:
2285 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2286 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2287 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2288 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2289 |(https?://)?(www\.)?
2290 (?P<showname>thedailyshow|colbertnation)\.com/
2291 (full-episodes/(?P<episode>.*)|
2293 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2294 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates, lowest to highest; the code below picks turls[-1] by default.
2297 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2299 _video_extensions = {
2307 _video_dimensions = {
# Overrides the base suitable() because _VALID_URL is a VERBOSE pattern.
2316 def suitable(self, url):
2317 """Receives a URL and returns True if suitable for this IE."""
2318 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2320 def report_extraction(self, episode_id):
2321 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2323 def report_config_download(self, episode_id, media_id):
2324 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2326 def report_index_download(self, episode_id):
2327 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2329 def _print_formats(self, formats):
2330 print('Available formats:')
2332 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2335 def _real_extract(self, url):
2336 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2338 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ':tds' style shortcuts are rewritten to the full-episodes front page.
2341 if mobj.group('shortname'):
2342 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2343 url = u'http://www.thedailyshow.com/full-episodes/'
2345 url = u'http://www.colbertnation.com/full-episodes/'
2346 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2347 assert mobj is not None
2349 if mobj.group('clip'):
2350 if mobj.group('showname') == 'thedailyshow':
2351 epTitle = mobj.group('tdstitle')
2353 epTitle = mobj.group('cntitle')
2356 dlNewest = not mobj.group('episode')
2358 epTitle = mobj.group('showname')
2360 epTitle = mobj.group('episode')
2362 req = compat_urllib_request.Request(url)
2363 self.report_extraction(epTitle)
2365 htmlHandle = compat_urllib_request.urlopen(req)
2366 html = htmlHandle.read()
2367 webpage = html.decode('utf-8')
2368 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2369 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# geturl() follows the redirect from the front page to the newest episode.
2372 url = htmlHandle.geturl()
2373 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2375 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2377 if mobj.group('episode') == '':
2378 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2380 epTitle = mobj.group('episode')
2382 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2384 if len(mMovieParams) == 0:
2385 # The Colbert Report embeds the information in a without
2386 # a URL prefix; so extract the alternate reference
2387 # and then add the URL prefix manually.
2389 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2390 if len(altMovieParams) == 0:
2391 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2394 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2396 uri = mMovieParams[0][1]
2397 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2398 self.report_index_download(epTitle)
2400 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2401 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2402 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2407 idoc = xml.etree.ElementTree.fromstring(indexXml)
2408 itemEls = idoc.findall('.//item')
# One MRSS <item> per episode part; each part is downloaded separately.
2409 for partNum,itemEl in enumerate(itemEls):
2410 mediaId = itemEl.findall('./guid')[0].text
2411 shortMediaId = mediaId.split(':')[-1]
2412 showId = mediaId.split(':')[-2].replace('.com', '')
2413 officialTitle = itemEl.findall('./title')[0].text
2414 officialDate = itemEl.findall('./pubDate')[0].text
2416 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2417 compat_urllib_parse.urlencode({'uri': mediaId}))
2418 configReq = compat_urllib_request.Request(configUrl)
2419 self.report_config_download(epTitle, shortMediaId)
2421 configXml = compat_urllib_request.urlopen(configReq).read()
2422 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2423 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2426 cdoc = xml.etree.ElementTree.fromstring(configXml)
2428 for rendition in cdoc.findall('.//rendition'):
2429 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2433 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2436 if self._downloader.params.get('listformats', None):
2437 self._print_formats([i[0] for i in turls])
2440 # For now, just pick the highest bitrate
2441 format,rtmp_video_url = turls[-1]
2443 # Get the format arg from the arg stream
2444 req_format = self._downloader.params.get('format', None)
2446 # Select format if we can find one
2449 format, rtmp_video_url = f, v
# The RTMP path is rewritten onto an HTTP CDN base to avoid rtmpdump.
2452 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2454 raise ExtractorError(u'Cannot transform RTMP url')
2455 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2456 video_url = base + m.group('finalid')
2458 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2463 'upload_date': officialDate,
2468 'description': officialTitle,
2470 results.append(info)
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines. Kept byte-for-byte; comments only.
# Reads the Escapist video page's og: meta tags to find the player config
# URL, fetches that JS-style config, and takes the media URL from its
# playlist.
2475 class EscapistIE(InfoExtractor):
2476 """Information extractor for The Escapist """
2478 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2479 IE_NAME = u'escapist'
2481 def report_extraction(self, showName):
2482 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2484 def report_config_download(self, showName):
2485 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2487 def _real_extract(self, url):
2488 mobj = re.match(self._VALID_URL, url)
2490 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2492 showName = mobj.group('showname')
2493 videoId = mobj.group('episode')
2495 self.report_extraction(showName)
2497 webPage = compat_urllib_request.urlopen(url)
2498 webPageBytes = webPage.read()
# Decode using the charset the server declared, falling back to utf-8.
2499 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2500 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2501 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2502 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2505 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2506 description = unescapeHTML(descMatch.group(1))
2507 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2508 imgUrl = unescapeHTML(imgMatch.group(1))
2509 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2510 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL rides in the player URL's `config=` query parameter.
2511 configUrlMatch = re.search('config=(.*)$', playerUrl)
2512 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2514 self.report_config_download(showName)
2516 configJSON = compat_urllib_request.urlopen(configUrl)
2517 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2518 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2519 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2520 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2523 # Technically, it's JavaScript, not JSON
2524 configJSON = configJSON.replace("'", '"')
2527 config = json.loads(configJSON)
2528 except (ValueError,) as err:
2529 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2532 playlist = config['playlist']
# playlist[1] is presumably the main video entry ([0] an intro) — verify.
2533 videoUrl = playlist[1]['url']
2538 'uploader': showName,
2539 'upload_date': None,
2542 'thumbnail': imgUrl,
2543 'description': description,
2544 'player_url': playerUrl,
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines. Kept byte-for-byte; comments only.
# Downloads the moogaloop metadata XML for a video id, then the Adobe HDS
# (f4m) manifest it points at, and assembles a Seg1-Frag1 fragment URL.
2549 class CollegeHumorIE(InfoExtractor):
2550 """Information extractor for collegehumor.com"""
2553 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2554 IE_NAME = u'collegehumor'
2556 def report_manifest(self, video_id):
2557 """Report information extraction."""
2558 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2560 def report_extraction(self, video_id):
2561 """Report information extraction."""
2562 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2564 def _real_extract(self, url):
2565 mobj = re.match(self._VALID_URL, url)
2567 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2569 video_id = mobj.group('videoid')
2574 'upload_date': None,
2577 self.report_extraction(video_id)
2578 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2580 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2581 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2582 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2585 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on malformed XML; caught as invalid
# metadata (the except line is among the missing fused numbers).
2587 videoNode = mdoc.findall('./video')[0]
2588 info['description'] = videoNode.findall('./description')[0].text
2589 info['title'] = videoNode.findall('./caption')[0].text
2590 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2591 manifest_url = videoNode.findall('./file')[0].text
2593 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore marker makes the server emit an HDS-compatible manifest.
2596 manifest_url += '?hdcore=2.10.3'
2597 self.report_manifest(video_id)
2599 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2600 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2601 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2604 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2606 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2607 node_id = media_node.attrib['url']
2608 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2609 except IndexError as err:
2610 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2613 url_pr = compat_urllib_parse_urlparse(manifest_url)
2614 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (if mobj is None:/return). Kept byte-for-byte.
# Scrapes the xvideos watch page for the flv_url flashvar, the <title>, and
# the thumbnail URL.
2621 class XVideosIE(InfoExtractor):
2622 """Information extractor for xvideos.com"""
2624 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2625 IE_NAME = u'xvideos'
2627 def report_extraction(self, video_id):
2628 """Report information extraction."""
2629 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2631 def _real_extract(self, url):
2632 mobj = re.match(self._VALID_URL, url)
2634 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2636 video_id = mobj.group(1)
2638 webpage = self._download_webpage(url, video_id)
2640 self.report_extraction(video_id)
# flv_url flashvar is percent-encoded in the page source; unquoted below.
2644 mobj = re.search(r'flv_url=(.+?)&', webpage)
2646 self._downloader.trouble(u'ERROR: unable to extract video url')
2648 video_url = compat_urllib_parse.unquote(mobj.group(1))
2652 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2654 self._downloader.trouble(u'ERROR: unable to extract video title')
2656 video_title = mobj.group(1)
2659 # Extract video thumbnail
2660 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2662 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0) — the whole matched URL — is the thumbnail, not the captured name.
2664 video_thumbnail = mobj.group(0)
2670 'upload_date': None,
2671 'title': video_title,
2673 'thumbnail': video_thumbnail,
2674 'description': None,
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/return/dict braces). Kept byte-for-byte.
# Resolves a soundcloud.com/<uploader>/<slug> page via the resolve.json API,
# then asks the streams endpoint for the 128kbps MP3 URL.
2680 class SoundcloudIE(InfoExtractor):
2681 """Information extractor for soundcloud.com
2682 To access the media, the uid of the song and a stream token
2683 must be extracted from the page source and the script must make
2684 a request to media.soundcloud.com/crossdomain.xml. Then
2685 the media can be grabbed by requesting from an url composed
2686 of the stream token and uid
2689 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2690 IE_NAME = u'soundcloud'
2692 def __init__(self, downloader=None):
2693 InfoExtractor.__init__(self, downloader)
2695 def report_resolve(self, video_id):
2696 """Report information extraction."""
2697 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2699 def report_extraction(self, video_id):
2700 """Report information extraction."""
2701 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2703 def _real_extract(self, url):
2704 mobj = re.match(self._VALID_URL, url)
2706 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2709 # extract uploader (which is in the url)
2710 uploader = mobj.group(1)
2711 # extract simple title (uploader + slug of song title)
2712 slug_title = mobj.group(2)
2713 simple_title = uploader + u'-' + slug_title
2715 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json turns the human-readable page URL into track metadata.
2717 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2718 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2719 request = compat_urllib_request.Request(resolv_url)
2721 info_json_bytes = compat_urllib_request.urlopen(request).read()
2722 info_json = info_json_bytes.decode('utf-8')
2723 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2724 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2727 info = json.loads(info_json)
2728 video_id = info['id']
2729 self.report_extraction('%s/%s' % (uploader, slug_title))
2731 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2732 request = compat_urllib_request.Request(streams_url)
2734 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2735 stream_json = stream_json_bytes.decode('utf-8')
2736 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2737 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2740 streams = json.loads(stream_json)
2741 mediaURL = streams['http_mp3_128_url']
2746 'uploader': info['user']['username'],
# NOTE(review): created_at is passed through raw; the class docstring says
# upload_date should be YYYYMMDD — mismatch to confirm against callers.
2747 'upload_date': info['created_at'],
2748 'title': info['title'],
2750 'description': info['description'],
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines. Kept byte-for-byte; comments only.
# Decodes the base64 `jsclassref` value from an infoq.com page into the RTMP
# stream path, and scrapes title/description from the page.
2754 class InfoQIE(InfoExtractor):
2755 """Information extractor for infoq.com"""
2756 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2758 def report_extraction(self, video_id):
2759 """Report information extraction."""
2760 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2762 def _real_extract(self, url):
2763 mobj = re.match(self._VALID_URL, url)
2765 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2768 webpage = self._download_webpage(url, video_id=url)
2769 self.report_extraction(url)
2772 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2774 self._downloader.trouble(u'ERROR: unable to extract video url')
# jsclassref is base64 of a percent-encoded RTMP path segment.
2776 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2777 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2780 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2782 self._downloader.trouble(u'ERROR: unable to extract video title')
2784 video_title = mobj.group(1)
2786 # Extract description
2787 video_description = u'No description available.'
2788 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2789 if mobj is not None:
2790 video_description = mobj.group(1)
# video id is the basename of the stream path, minus its extension.
2792 video_filename = video_url.split('/')[-1]
2793 video_id, extension = video_filename.split('.')
2799 'upload_date': None,
2800 'title': video_title,
2801 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2803 'description': video_description,
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/return/else branches). Kept byte-for-byte.
# Marked _WORKING = False. Pulls the cloudcast JSON, then probes the listed
# audio-format URLs until one responds, honouring --format/--list-formats.
2808 class MixcloudIE(InfoExtractor):
2809 """Information extractor for www.mixcloud.com"""
2811 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2812 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2813 IE_NAME = u'mixcloud'
2815 def __init__(self, downloader=None):
2816 InfoExtractor.__init__(self, downloader)
2818 def report_download_json(self, file_id):
2819 """Report JSON download."""
2820 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2822 def report_extraction(self, file_id):
2823 """Report information extraction."""
2824 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2826 def get_urls(self, jsonData, fmt, bitrate='best'):
2827 """Get urls from 'audio_formats' section in json"""
2830 bitrate_list = jsonData[fmt]
2831 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# max() on the bitrate keys picks the highest available quality.
2832 bitrate = max(bitrate_list) # select highest
2834 url_list = jsonData[fmt][bitrate]
2835 except TypeError: # we have no bitrate info.
2836 url_list = jsonData[fmt]
2839 def check_urls(self, url_list):
2840 """Returns 1st active url from list"""
2841 for url in url_list:
2843 compat_urllib_request.urlopen(url)
2845 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2850 def _print_formats(self, formats):
2851 print('Available formats:')
2852 for fmt in formats.keys():
2853 for b in formats[fmt]:
2855 ext = formats[fmt][b][0]
2856 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2857 except TypeError: # we have no bitrate info
2858 ext = formats[fmt][0]
2859 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2862 def _real_extract(self, url):
2863 mobj = re.match(self._VALID_URL, url)
2865 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2867 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on re match groups is py2-only; under py3
# these are str and would raise AttributeError — consistent with _WORKING=False.
2868 uploader = mobj.group(1).decode('utf-8')
2869 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2871 # construct API request
2872 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2873 # retrieve .json file with links to files
2874 request = compat_urllib_request.Request(file_url)
2876 self.report_download_json(file_url)
2877 jsonData = compat_urllib_request.urlopen(request).read()
2878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2879 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2883 json_data = json.loads(jsonData)
2884 player_url = json_data['player_swf_url']
2885 formats = dict(json_data['audio_formats'])
2887 req_format = self._downloader.params.get('format', None)
2890 if self._downloader.params.get('listformats', None):
2891 self._print_formats(formats)
2894 if req_format is None or req_format == 'best':
# Probe formats in dict order until one format yields a live URL.
2895 for format_param in formats.keys():
2896 url_list = self.get_urls(formats, format_param)
2898 file_url = self.check_urls(url_list)
2899 if file_url is not None:
2902 if req_format not in formats:
2903 self._downloader.trouble(u'ERROR: format is not available')
2906 url_list = self.get_urls(formats, req_format)
2907 file_url = self.check_urls(url_list)
2908 format_param = req_format
2911 'id': file_id.decode('utf-8'),
2912 'url': file_url.decode('utf-8'),
2913 'uploader': uploader.decode('utf-8'),
2914 'upload_date': None,
2915 'title': json_data['name'],
2916 'ext': file_url.split('.')[-1].decode('utf-8'),
2917 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2918 'thumbnail': json_data['thumbnail_url'],
2919 'description': json_data['description'],
2920 'player_url': player_url.decode('utf-8'),
# NOTE(review): mangled extraction — fused line numbers, stripped indentation,
# missing interior lines (try:/return/dict braces). Kept byte-for-byte.
# Three-way dispatch on the matched URL: a specific video (course+video), a
# course page (references to its VideoPage links), or the root home page
# (references to all CoursePage links). Reference entries are re-fed through
# self.extract recursively.
2923 class StanfordOpenClassroomIE(InfoExtractor):
2924 """Information extractor for Stanford's Open ClassRoom"""
2926 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2927 IE_NAME = u'stanfordoc'
2929 def report_download_webpage(self, objid):
2930 """Report information extraction."""
2931 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2933 def report_extraction(self, video_id):
2934 """Report information extraction."""
2935 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2937 def _real_extract(self, url):
2938 mobj = re.match(self._VALID_URL, url)
2940 raise ExtractorError(u'Invalid URL: %s' % url)
2942 if mobj.group('course') and mobj.group('video'): # A specific video
2943 course = mobj.group('course')
2944 video = mobj.group('video')
2946 'id': course + '_' + video,
2948 'upload_date': None,
2951 self.report_extraction(info['id'])
2952 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2953 xmlUrl = baseUrl + video + '.xml'
2955 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2956 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2957 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2959 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2961 info['title'] = mdoc.findall('./title')[0].text
# Media URL is relative to the course's videos/ directory.
2962 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2964 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2966 info['ext'] = info['url'].rpartition('.')[2]
2968 elif mobj.group('course'): # A course page
2969 course = mobj.group('course')
2974 'upload_date': None,
2977 coursepage = self._download_webpage(url, info['id'],
2978 note='Downloading course info page',
2979 errnote='Unable to download course info page')
2981 m = re.search('<h1>([^<]+)</h1>', coursepage)
2983 info['title'] = unescapeHTML(m.group(1))
2985 info['title'] = info['id']
2987 m = re.search('<description>([^<]+)</description>', coursepage)
2989 info['description'] = unescapeHTML(m.group(1))
# orderedSet dedupes while keeping first-seen order of the video links.
2991 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2994 'type': 'reference',
2995 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2999 for entry in info['list']:
3000 assert entry['type'] == 'reference'
3001 results += self.extract(entry['url'])
3005 'id': 'Stanford OpenClassroom',
3008 'upload_date': None,
3011 self.report_download_webpage(info['id'])
3012 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3014 rootpage = compat_urllib_request.urlopen(rootURL).read()
3015 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3016 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3019 info['title'] = info['id']
3021 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3024 'type': 'reference',
3025 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3030 for entry in info['list']:
3031 assert entry['type'] == 'reference'
3032 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url  # default to http when no scheme was given
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # song name, performer and playlist URI come from <meta> tags
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): message is missing the word "extract" ("unable to mtvn_uri")
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen endpoint returns an XML document listing the renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        _,_,ext = rendition.attrib['type'].partition('/')  # MIME subtype used as extension
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        # fields of the returned info dict
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com multi-segment videos."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

        # NOTE(review): body of the session-id generator (presumably _gen_sid,
        # its def line is not visible here): millisecond timestamp plus two
        # random components, concatenated as a decimal string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seeded character shuffle used to decode file ids."""
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # linear-congruential step; each round picks and removes one char
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated '*'-separated file id using the seed."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId.append(mixed[int(ch)])  # map each index back into the mix string
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # JSON playlist endpoint for this video id
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        # pick a format: requested, 'best' (prefers hd2), or 'worst'
        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':
        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
        self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # splice the segment index (hex) into the decoded file id
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # one info dict per segment ("partNN")
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # page-scraping patterns: flash video URL, page title, thumbnail
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        # flv URL is percent-encoded inside the page
        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # fields of the returned info dict
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader name."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract the uploader (post author)
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
        video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # fields of the returned info dict
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]  # strip directory-index suffix

        webpage = self._download_webpage(url, video_id)

        # direct CDN URL is derived from the page path, not scraped
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # scrape a single property from the already-downloaded page
            m = re.search(rexp, webpage)
            return unescapeHTML(m.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # fields of the returned info dict
        'id': shortened_video_id,
        # NOTE(review): 'uploader_date' looks like a typo for 'upload_date' — confirm
        'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
        'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # group 1: channel name; optional group 2: broadcast id after /b/
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100  # API page size used for channel archives
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and convert its clips to info dicts."""
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        # NOTE(review): isinstance(response, list) would be the idiomatic check
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            video_date = re.sub('-', '', clip['start_time'][:10])  # YYYY-MM-DD -> YYYYMMDD
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # fields of the per-clip info dict
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # lastindex == 1 means only the channel matched (paged archive);
        # otherwise a single broadcast id was given
        if mobj.lastindex == 1:
            api += '/channel/archives/%s.json'
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
        # page through the API until a short page signals the end
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url)
        info.extend(page_info)
        if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # second <source> tag inside the <video> element holds the file URL
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        # field of the returned info dict
        'description': desc,
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # the tweet's status id names the .mov file on the CDN (see below)
        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'WARNING: Cannot find description')
        # strip embedded <a> tags, then unescape the tweet text
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        # fields of the returned info dict
        'description': desc,
        'uploader': uploader,
        'uploader_id': uploader_id,
        'internal_id': status_id,
        'upload_date': upload_date
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented pattern
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # matches the per-movie JS blobs on the game's /video/ page
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # zip the three parallel scans: URL blob, title span, thumbnail
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            # fields of the per-video info dict
            'title': unescapeHTML(title),
            'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # direct CDN URL is derived from the recording id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        # uploader is the numeric channel id on the page
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        # field of the returned info dict
        'uploader': uploader
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # show metadata is embedded as JSON assigned to gon.show
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        data = json.loads(json_data)
        except ValueError as e:
        raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'  # request the 256 kbit stream
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # fields of the returned info dict (optional ones use .get())
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the single format entry matching req_format."""
        if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # bypass the age gate with a pre-set cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract video date')
        upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract uploader')
        video_uploader = None  # uploader is optional; warn and continue
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # path component 4 encodes "<size>_<bitrate>_..."; keep the first two
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        title = u'%s-%s-%s' % (video_title, size, bitrate)

        # fields of the per-format info dict
        'uploader': video_uploader,
        'upload_date': upload_date,
        'description': None,

        # honor --list-formats before selecting anything
        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # format selection: best (default), worst, all, or a specific format
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        format = self._specific( req_format, formats )
        self._downloader.trouble(u'ERROR: requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # id and title both come straight from the URL
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # the flv URL is percent-encoded inside a JS player config
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        # NOTE(review): error message says "title" but this lookup is for the upload date
        self._downloader.trouble(u'ERROR: unable to extract video title')
        upload_date = result.group('date')

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')

        # the embed page carries the real (numeric) video id and the file URL
        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # mix metadata is embedded as a JS assignment on the page
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # random session token required by the play API
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): mix_id is used here and below but its assignment is not
        # visible in this excerpt — presumably taken from the parsed mix data
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # walk the play API one track at a time until at_last_track
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # fields of the per-track info dict
            'id': track_data['id'],
            'url': track_data['track_file_stream_url'],
            'title': track_data['performer'] + u' - ' + track_data['name'],
            'raw_title': track_data['name'],
            'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # video and thumbnail URLs are derived directly from the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
        # fields of the returned info dict
        'thumbnail': thumbnail,
        'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www.ted.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented pattern
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # single talk -> one info dict; playlist -> list of talks
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # pair each <li> talk entry with its title/link paragraph
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails JS blob carries the numeric id and the download slug
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
            "id":(?P<videoID>[\d]+).*?
            "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # field of the returned info dict
        'thumbnail': thumb_match.group('thumbnail')
4069 class MySpassIE(InfoExtractor):
# Extractor for myspass.de. The site exposes an XML metadata endpoint
# keyed by the numeric video id, which this IE parses for the FLV URL,
# title, format, description and preview image.
4070     _VALID_URL = r'http://www.myspass.de/.*'
4072     def _real_extract(self, url):
4073         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4075         # video id is the last path element of the URL
4076         # usually there is a trailing slash, so also try the second but last
4077         url_path = compat_urllib_parse_urlparse(url).path
4078         url_parent_path, video_id = os.path.split(url_path)
# If the URL ended with '/', the first split yields an empty tail, so
# fall back to the last element of the parent path.
# NOTE(review): the `if not video_id:` guard line (4079) is elided here.
4080             _, video_id = os.path.split(url_parent_path)
# Fetch and parse the XML metadata document for this id.
4083         metadata_url = META_DATA_URL_TEMPLATE % video_id
4084         metadata_text = self._download_webpage(metadata_url, video_id)
4085         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4087         # extract values from metadata
4088         url_flv_el = metadata.find('url_flv')
4089         if url_flv_el is None:
4090             self._downloader.trouble(u'ERROR: unable to extract download url')
# NOTE(review): the `return` after trouble() (line 4091) is elided —
# presumably present, since execution must not continue on error.
4092         video_url = url_flv_el.text
# Derive the file extension from the media URL (strip the dot).
4093         extension = os.path.splitext(video_url)[1][1:]
4094         title_el = metadata.find('title')
4095         if title_el is None:
4096             self._downloader.trouble(u'ERROR: unable to extract title')
4098         title = title_el.text
4099         format_id_el = metadata.find('format_id')
4100         if format_id_el is None:
# NOTE(review): lines 4101-4102 elided — presumably a fallback
# assignment plus else-branch; confirm against the full file.
4103             format = format_id_el.text
# Description and thumbnail are optional in the metadata document.
4104         description_el = metadata.find('description')
4105         if description_el is not None:
4106             description = description_el.text
4109         imagePreview_el = metadata.find('imagePreview')
4110         if imagePreview_el is not None:
4111             thumbnail = imagePreview_el.text
# NOTE(review): lines 4112-4119 (opening of the returned info dict and
# the else-defaults for description/thumbnail) are elided.
4120             'thumbnail': thumbnail,
4121             'description': description
4125 def gen_extractors():
4126 """ Return a list of an instance of every supported extractor.
4127 The order does matter; the first extractor matched is the one handling the URL.
4130 YoutubePlaylistIE(),
4154 StanfordOpenClassroomIE(),