# -*- coding: utf-8 -*-
from __future__ import absolute_import

import datetime
import netrc
import re
import socket
import sys
import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Strip the trailing "IE" from the class name, e.g. YoutubeIE -> Youtube
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction going even on mis-encoded pages
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the source regex was truncated mid-pattern (the `v=`
    # alternative and the opening capture group were dropped); reconstructed
    # so that group(2) is the video ID, as _extract_id expects.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; entries were dropped in the source,
    # restored from the format lists above (unknown itags fall back to 'flv').
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions (HxW), used only for --list-formats output.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
def suitable(self, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Overrides the base class: the YouTube _VALID_URL is written with
    # (?x)-style comments, so the VERBOSE flag must be passed here.
    return re.match(self._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Report extracted video URL."""
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
231 def _closed_captions_xml_to_srt(self, xml_string):
233 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
234 # TODO parse xml instead of regex
235 for n, (start, dur_tag, dur, caption) in enumerate(texts):
236 if not dur: dur = '4'
238 end = start + float(dur)
239 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
240 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
241 caption = unescapeHTML(caption)
242 caption = unescapeHTML(caption) # double cycle, intentional
243 srt += str(n+1) + '\n'
244 srt += start + ' --> ' + end + '\n'
245 srt += caption + '\n\n'
248 def _extract_subtitles(self, video_id):
249 self.report_video_subtitles_download(video_id)
250 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
252 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
253 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
254 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
255 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
256 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
257 if not srt_lang_list:
258 return (u'WARNING: video has no closed captions', None)
259 if self._downloader.params.get('subtitleslang', False):
260 srt_lang = self._downloader.params.get('subtitleslang')
261 elif 'en' in srt_lang_list:
264 srt_lang = list(srt_lang_list.keys())[0]
265 if not srt_lang in srt_lang_list:
266 return (u'WARNING: no closed captions found in the specified language', None)
267 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
269 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
270 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
271 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
273 return (u'WARNING: unable to download video subtitles', None)
274 return (None, self._closed_captions_xml_to_srt(srt_xml))
276 def _print_formats(self, formats):
277 print('Available formats:')
279 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
281 def _real_initialize(self):
282 if self._downloader is None:
287 downloader_params = self._downloader.params
289 # Attempt to use provided username and password or .netrc data
290 if downloader_params.get('username', None) is not None:
291 username = downloader_params['username']
292 password = downloader_params['password']
293 elif downloader_params.get('usenetrc', False):
295 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
300 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
301 except (IOError, netrc.NetrcParseError) as err:
302 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
306 request = compat_urllib_request.Request(self._LANG_URL)
309 compat_urllib_request.urlopen(request).read()
310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
311 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
314 # No authentication to be performed
320 'current_form': 'loginForm',
322 'action_login': 'Log In',
323 'username': username,
324 'password': password,
326 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
329 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
330 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
331 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
333 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
334 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
340 'action_confirm': 'Confirm',
342 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
344 self.report_age_confirmation()
345 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
346 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
350 def _extract_id(self, url):
351 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
353 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
355 video_id = mobj.group(2)
def _real_extract(self, url):
    """Download the watch page and get_video_info, then build one info
    dict per selected format. Returns a list of dicts or None on error."""
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    try:
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
        return

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info: try the various &el= values until one returns a token
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        request = compat_urllib_request.Request(video_info_url)
        try:
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
        else:
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
        return

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        self._downloader.trouble(u'ERROR: "rental" videos not supported')
        return

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader name')
        return
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
        return
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        video_thumbnail = ''
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            try:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
            except:
                pass

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    else:
        video_description = ''

    # closed captions
    video_subtitles = None
    if self._downloader.params.get('writesubtitles', False):
        (srt_error, video_subtitles) = self._extract_subtitles(video_id)
        if srt_error:
            self._downloader.trouble(srt_error)

    if 'length_seconds' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = ''
    else:
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # token
    video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [compat_parse_qs(uds) for uds in url_data_strs]
        url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        else:
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            return
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                if rf in url_map:
                    video_url_list = [(rf, url_map[rf])]
                    break
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
    else:
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
        return

    results = []
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'))

        results.append({
            'id':           video_id,
            'url':          video_real_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'format':       video_format,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
            'player_url':   player_url,
            'subtitles':    video_subtitles,
            'duration':     video_duration
        })
    return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Hand the embedded YouTube video over to the YouTube extractor
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip title suffix and query noise to get the bare video id
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present in the flashvars
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> tag
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            # new_video=False prevents a second rewrite round-trip
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
972 class VimeoIE(InfoExtractor):
# NOTE(review): this chunk is a line-numbered listing with omitted lines; the
# gaps in the embedded numbering (e.g. 993, 1001, 1015, 1085-1087) imply
# "if mobj is None:" guards, "try:" openers and the final "return [info]"
# that are not visible here. Code below is kept byte-identical.
973 """Information extractor for vimeo.com."""
975 # _VALID_URL matches Vimeo URLs
976 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
979 def __init__(self, downloader=None):
980 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers: write status lines through the shared downloader.
982 def report_download_webpage(self, video_id):
983 """Report webpage download."""
984 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
986 def report_extraction(self, video_id):
987 """Report information extraction."""
988 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
990 def _real_extract(self, url, new_video=True):
# Extract the numeric video id (regex group 1 of _VALID_URL), fetch the
# watch page, parse the embedded player config JSON, and assemble the
# info dictionary expected by FileDownloader.
991 # Extract ID from URL
992 mobj = re.match(self._VALID_URL, url)
994 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
997 video_id = mobj.group(1)
999 # Retrieve video webpage to extract further information
1000 request = compat_urllib_request.Request(url, None, std_headers)
1002 self.report_download_webpage(video_id)
1003 webpage_bytes = compat_urllib_request.urlopen(request).read()
1004 webpage = webpage_bytes.decode('utf-8')
1005 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1006 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1009 # Now we begin extracting as much information as we can from what we
1010 # retrieved. First we extract the information common to all extractors,
1011 # and latter we extract those that are Vimeo specific.
1012 self.report_extraction(video_id)
1014 # Extract the config JSON
# NOTE(review): brittle string-splitting parse — assumes the page contains
# exactly one " = {config:" marker followed by ",assets:"; any markup
# change on vimeo.com breaks this.
1016 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1017 config = json.loads(config)
1019 self._downloader.trouble(u'ERROR: unable to extract info section')
1023 video_title = config["video"]["title"]
1025 # Extract uploader and uploader_id
1026 video_uploader = config["video"]["owner"]["name"]
1027 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1029 # Extract video thumbnail
1030 video_thumbnail = config["video"]["thumbnail"]
1032 # Extract video description
1033 video_description = get_element_by_attribute("itemprop", "description", webpage)
1034 if video_description: video_description = clean_html(video_description)
1035 else: video_description = ''
1037 # Extract upload date
# Date is reassembled as YYYYMMDD from the dateCreated meta tag; stays
# None when the tag is absent (upload_date is an optional field).
1038 video_upload_date = None
1039 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1040 if mobj is not None:
1041 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1043 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters of the play_redirect URL below.
1044 sig = config['request']['signature']
1045 timestamp = config['request']['timestamp']
1047 # Vimeo specific: extract video codec and quality information
1048 # First consider quality, then codecs, then take everything
1049 # TODO bind to format param
1050 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
# Bucket each available codec by quality; 'other' keeps whatever quality
# key the config lists first for that codec.
1051 files = { 'hd': [], 'sd': [], 'other': []}
1052 for codec_name, codec_extension in codecs:
1053 if codec_name in config["video"]["files"]:
1054 if 'hd' in config["video"]["files"][codec_name]:
1055 files['hd'].append((codec_name, codec_extension, 'hd'))
1056 elif 'sd' in config["video"]["files"][codec_name]:
1057 files['sd'].append((codec_name, codec_extension, 'sd'))
1059 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first entry of the best non-empty quality bucket (hd > sd > other).
1061 for quality in ('hd', 'sd', 'other'):
1062 if len(files[quality]) > 0:
1063 video_quality = files[quality][0][2]
1064 video_codec = files[quality][0][0]
1065 video_extension = files[quality][0][1]
1066 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1069 self._downloader.trouble(u'ERROR: no known codec found')
1072 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1073 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Info dictionary per the InfoExtractor contract (the 'id'/'url' entries
# and surrounding braces fall in the numbering gap above line 1078).
1078 'uploader': video_uploader,
1079 'uploader_id': video_uploader_id,
1080 'upload_date': video_upload_date,
1081 'title': video_title,
1082 'ext': video_extension,
1083 'thumbnail': video_thumbnail,
1084 'description': video_description,
1088 class ArteTvIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — "try:" openers,
# "if mobj is None:" guards, "return" statements and the "info = {" /
# closing-paren lines implied by the numbering gaps are not visible.
# Code below is kept byte-identical.
1089 """arte.tv information extractor."""
1091 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1092 _LIVE_URL = r'index-[0-9]+\.html$'
1094 IE_NAME = u'arte.tv'
1096 def __init__(self, downloader=None):
1097 InfoExtractor.__init__(self, downloader)
1099 def report_download_webpage(self, video_id):
1100 """Report webpage download."""
1101 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1103 def report_extraction(self, video_id):
1104 """Report information extraction."""
1105 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1107 def fetch_webpage(self, url):
# Download url and return the raw page body; network and URL errors are
# routed through self._downloader.trouble.
1108 request = compat_urllib_request.Request(url)
1110 self.report_download_webpage(url)
1111 webpage = compat_urllib_request.urlopen(request).read()
1112 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1113 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1115 except ValueError as err:
1116 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1120 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, apply regex with regexFlags, and map the requested match
# groups into a dict: matchTuples is a list of (group_index, key,
# error_message) triples. (The dict initialization and return fall in
# the numbering gaps.)
1121 page = self.fetch_webpage(url)
1122 mobj = re.search(regex, page, regexFlags)
1126 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1129 for (i, key, err) in matchTuples:
1130 if mobj.group(i) is None:
1131 self._downloader.trouble(err)
1134 info[key] = mobj.group(i)
1138 def extractLiveStream(self, url):
# Live-stream path: locate the videothek JS config, then pull the stream
# path/player/url triple out of it.
# assumes url has the shape http://videos.arte.tv/<lang>/... so that
# split('/')[-4] is the language segment — TODO confirm against callers.
1139 video_lang = url.split('/')[-4]
1140 info = self.grep_webpage(
1142 r'src="(.*?/videothek_js.*?\.js)',
1145 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1148 http_host = url.split('/')[2]
1149 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1150 info = self.grep_webpage(
1152 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1153 '(http://.*?\.swf).*?' +
1157 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1158 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1159 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1162 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1164 def extractPlus7Stream(self, url):
# Plus7 (catch-up) path: follow videorefFileUrl, select the <video> ref
# for the page language, then read id/name/date/hd-url from the final XML.
1165 video_lang = url.split('/')[-3]
1166 info = self.grep_webpage(
1168 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1171 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1174 next_url = compat_urllib_parse.unquote(info.get('url'))
1175 info = self.grep_webpage(
1177 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1180 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1183 next_url = compat_urllib_parse.unquote(info.get('url'))
1185 info = self.grep_webpage(
1187 r'<video id="(.*?)".*?>.*?' +
1188 '<name>(.*?)</name>.*?' +
1189 '<dateVideo>(.*?)</dateVideo>.*?' +
1190 '<url quality="hd">(.*?)</url>',
1193 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1194 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1195 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1196 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): 'date' is passed through as upload_date without visible
# reformatting — the contract expects YYYYMMDD; verify <dateVideo> format.
1201 'id': info.get('id'),
1202 'url': compat_urllib_parse.unquote(info.get('url')),
1203 'uploader': u'arte.tv',
1204 'upload_date': info.get('date'),
1205 'title': info.get('title').decode('utf-8'),
1211 def _real_extract(self, url):
# Dispatch: live pages (matching _LIVE_URL) go to extractLiveStream,
# everything else to extractPlus7Stream.
1212 video_id = url.split('/')[-1]
1213 self.report_extraction(video_id)
1215 if re.search(self._LIVE_URL, video_id) is not None:
1216 self.extractLiveStream(url)
1219 info = self.extractPlus7Stream(url)
1224 class GenericIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — "try:" openers,
# "if mobj is None:" guards, method bodies such as HeadRequest.get_method's
# return, and the final info-dict/return lines fall in the numbering gaps.
# Code below is kept byte-identical.
1225 """Generic last-resort information extractor."""
1228 IE_NAME = u'generic'
1230 def __init__(self, downloader=None):
1231 InfoExtractor.__init__(self, downloader)
1233 def report_download_webpage(self, video_id):
1234 """Report webpage download."""
# Warn loudly: reaching the generic extractor means no site-specific IE matched.
1235 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1236 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1238 def report_extraction(self, video_id):
1239 """Report information extraction."""
1240 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1242 def report_following_redirect(self, new_url):
1243 """Report information extraction."""
1244 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1246 def _test_redirect(self, url):
1247 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Issues an HTTP HEAD (cheap, no body) to discover the final URL behind
# shorteners; if it differs, restarts the whole download chain on it.
1248 class HeadRequest(compat_urllib_request.Request):
1249 def get_method(self):
1252 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1254 Subclass the HTTPRedirectHandler to make it use our
1255 HeadRequest also on the redirected URL
1257 def redirect_request(self, req, fp, code, msg, headers, newurl):
1258 if code in (301, 302, 303, 307):
1259 newurl = newurl.replace(' ', '%20')
# Strip body-related headers: the redirected request carries no body.
1260 newheaders = dict((k,v) for k,v in req.headers.items()
1261 if k.lower() not in ("content-length", "content-type"))
1262 return HeadRequest(newurl,
1264 origin_req_host=req.get_origin_req_host(),
1267 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1269 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1271 Fallback to GET if HEAD is not allowed (405 HTTP error)
1273 def http_error_405(self, req, fp, code, msg, headers):
1277 newheaders = dict((k,v) for k,v in req.headers.items()
1278 if k.lower() not in ("content-length", "content-type"))
1279 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1281 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1285 opener = compat_urllib_request.OpenerDirector()
1286 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1287 HTTPMethodFallback, HEADRedirectHandler,
1288 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1289 opener.add_handler(handler())
1291 response = opener.open(HeadRequest(url))
1292 new_url = response.geturl()
1297 self.report_following_redirect(new_url)
1298 self._downloader.download([new_url])
1301 def _real_extract(self, url):
# Last-resort extraction: scrape the page for a direct media URL
# (JW Player flashvars first, then a broader file=/source= search).
1302 if self._test_redirect(url): return
1304 video_id = url.split('/')[-1]
1305 request = compat_urllib_request.Request(url)
1307 self.report_download_webpage(video_id)
1308 webpage = compat_urllib_request.urlopen(request).read()
1309 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1310 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1312 except ValueError as err:
1313 # since this is the last-resort InfoExtractor, if
1314 # this error is thrown, it'll be thrown here
1315 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1318 self.report_extraction(video_id)
1319 # Start with something easy: JW Player in SWFObject
1320 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1322 # Broaden the search a little bit
1323 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1325 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1328 # It's possible that one of the regexes
1329 # matched, but returned an empty group:
1330 if mobj.group(1) is None:
1331 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1334 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Derive id and extension from the media URL's basename.
1335 video_id = os.path.basename(video_url)
1337 # here's a fun little line of code for you:
1338 video_extension = os.path.splitext(video_id)[1][1:]
1339 video_id = os.path.splitext(video_id)[0]
1341 # it's tempting to parse this further, but you would
1342 # have to take into account all the variations like
1343 # Video Title - Site Name
1344 # Site Name | Video Title
1345 # Video Title - Tagline | Site Name
1346 # and so on and so forth; it's just not practical
1347 mobj = re.search(r'<title>(.*)</title>', webpage)
1349 self._downloader.trouble(u'ERROR: unable to extract title')
1351 video_title = mobj.group(1)
1353 # video uploader is domain name
# NOTE(review): error message says "unable to extract title" here although
# it is the uploader (domain) extraction that failed — copy/paste slip in
# the original; left untouched since runtime strings are code.
1354 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1356 self._downloader.trouble(u'ERROR: unable to extract title')
1358 video_uploader = mobj.group(1)
1363 'uploader': video_uploader,
1364 'upload_date': None,
1365 'title': video_title,
1366 'ext': video_extension,
1370 class YoutubeSearchIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — "if mobj is None:"
# guards, "try:" openers, "return" statements and loop/counter setup lines
# fall in the numbering gaps. Code below is kept byte-identical.
1371 """Information Extractor for YouTube search queries."""
# "URL" scheme: ytsearch:QUERY, ytsearchN:QUERY, or ytsearchall:QUERY.
1372 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1373 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1374 _max_youtube_results = 1000
1375 IE_NAME = u'youtube:search'
1377 def __init__(self, downloader=None):
1378 InfoExtractor.__init__(self, downloader)
1380 def report_download_page(self, query, pagenum):
1381 """Report attempt to download search page with given number."""
# NOTE(review): query.decode(...) implies query is a byte string here
# (Python 2 era) — would raise on Python 3 str.
1382 query = query.decode(preferredencoding())
1383 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1385 def _real_extract(self, query):
# Parse the ytsearch prefix: empty -> 1 result, 'all' -> cap at
# _max_youtube_results, a number -> that many (validated below).
1386 mobj = re.match(self._VALID_URL, query)
1388 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1391 prefix, query = query.split(':')
1393 query = query.encode('utf-8')
1395 self._download_n_results(query, 1)
1397 elif prefix == 'all':
1398 self._download_n_results(query, self._max_youtube_results)
1404 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1406 elif n > self._max_youtube_results:
1407 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1408 n = self._max_youtube_results
1409 self._download_n_results(query, n)
1411 except ValueError: # parsing prefix as integer fails
1412 self._download_n_results(query, 1)
1415 def _download_n_results(self, query, n):
1416 """Downloads a specified number of results for a query"""
# Page through the GData API (50 results per page) collecting video ids,
# then hand each watch URL back to the downloader.
1422 while (50 * pagenum) < limit:
1423 self.report_download_page(query, pagenum+1)
1424 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1425 request = compat_urllib_request.Request(result_url)
1427 data = compat_urllib_request.urlopen(request).read()
1428 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1429 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1431 api_response = json.loads(data)['data']
1433 new_ids = list(video['id'] for video in api_response['items'])
1434 video_ids += new_ids
# Shrink the page loop's limit to the API-reported total when it is
# smaller than the requested n.
1436 limit = min(n, api_response['totalItems'])
1439 if len(video_ids) > n:
1440 video_ids = video_ids[:n]
1441 for id in video_ids:
1442 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1446 class GoogleSearchIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — guards, "try:" openers,
# returns and loop setup fall in the numbering gaps. Structure mirrors
# YoutubeSearchIE but scrapes HTML result pages instead of a JSON API.
# Code below is kept byte-identical.
1447 """Information Extractor for Google Video search queries."""
1448 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1449 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Regex that pulls the docid of each result link out of the result HTML.
1450 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" control; absence ends pagination.
1451 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1452 _max_google_results = 1000
1453 IE_NAME = u'video.google:search'
1455 def __init__(self, downloader=None):
1456 InfoExtractor.__init__(self, downloader)
1458 def report_download_page(self, query, pagenum):
1459 """Report attempt to download playlist page with given number."""
1460 query = query.decode(preferredencoding())
1461 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1463 def _real_extract(self, query):
# Parse the gvsearch prefix: empty -> 1, 'all' -> capped maximum,
# number -> that many (validated below).
1464 mobj = re.match(self._VALID_URL, query)
1466 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1469 prefix, query = query.split(':')
1471 query = query.encode('utf-8')
1473 self._download_n_results(query, 1)
1475 elif prefix == 'all':
1476 self._download_n_results(query, self._max_google_results)
1482 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1484 elif n > self._max_google_results:
1485 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1486 n = self._max_google_results
1487 self._download_n_results(query, n)
1489 except ValueError: # parsing prefix as integer fails
1490 self._download_n_results(query, 1)
1493 def _download_n_results(self, query, n):
1494 """Downloads a specified number of results for a query"""
# Scrape result pages (10 per page via start=pagenum*10), dedupe ids,
# and dispatch videoplay URLs once n ids are collected or pages run out.
1500 self.report_download_page(query, pagenum)
1501 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1502 request = compat_urllib_request.Request(result_url)
1504 page = compat_urllib_request.urlopen(request).read()
1505 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1506 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1509 # Extract video identifiers
1510 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1511 video_id = mobj.group(1)
1512 if video_id not in video_ids:
1513 video_ids.append(video_id)
1514 if len(video_ids) == n:
1515 # Specified n videos reached
1516 for id in video_ids:
1517 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" control on the page: dispatch whatever was collected and stop.
1520 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1521 for id in video_ids:
1522 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1525 pagenum = pagenum + 1
1528 class YahooSearchIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — guards, "try:" openers,
# returns and loop setup fall in the numbering gaps. Same shape as
# GoogleSearchIE; differs only in URL templates and an explicit
# already_seen set for dedupe. Code below is kept byte-identical.
1529 """Information Extractor for Yahoo! Video search queries."""
1532 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1533 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1534 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1535 _MORE_PAGES_INDICATOR = r'\s*Next'
1536 _max_yahoo_results = 1000
1537 IE_NAME = u'video.yahoo:search'
1539 def __init__(self, downloader=None):
1540 InfoExtractor.__init__(self, downloader)
1542 def report_download_page(self, query, pagenum):
1543 """Report attempt to download playlist page with given number."""
1544 query = query.decode(preferredencoding())
1545 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1547 def _real_extract(self, query):
# Parse the yvsearch prefix: empty -> 1, 'all' -> capped maximum,
# number -> that many (validated below).
1548 mobj = re.match(self._VALID_URL, query)
1550 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1553 prefix, query = query.split(':')
1555 query = query.encode('utf-8')
1557 self._download_n_results(query, 1)
1559 elif prefix == 'all':
1560 self._download_n_results(query, self._max_yahoo_results)
1566 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1568 elif n > self._max_yahoo_results:
1569 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1570 n = self._max_yahoo_results
1571 self._download_n_results(query, n)
1573 except ValueError: # parsing prefix as integer fails
1574 self._download_n_results(query, 1)
1577 def _download_n_results(self, query, n):
1578 """Downloads a specified number of results for a query"""
# already_seen guards against duplicate watch ids across result pages.
1581 already_seen = set()
1585 self.report_download_page(query, pagenum)
1586 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1587 request = compat_urllib_request.Request(result_url)
1589 page = compat_urllib_request.urlopen(request).read()
1590 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1591 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1594 # Extract video identifiers
1595 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1596 video_id = mobj.group(1)
1597 if video_id not in already_seen:
1598 video_ids.append(video_id)
1599 already_seen.add(video_id)
1600 if len(video_ids) == n:
1601 # Specified n videos reached
1602 for id in video_ids:
1603 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link: dispatch what was collected and stop paging.
1606 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1607 for id in video_ids:
1608 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1611 pagenum = pagenum + 1
1614 class YoutubePlaylistIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — "if mobj is None:"
# guards, "return" statements, the pagenum/video_ids initialization and
# else-branches fall in the numbering gaps. Code below is kept byte-identical.
1615 """Information Extractor for YouTube playlists."""
# Matches playlist/course/artist/user-page URLs and bare PL/EC ids; group 1
# is the URL "type" marker (p|a|list), group 2 the playlist id, group 3 an
# optional direct video path inside the playlist URL.
1617 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1618 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1619 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1620 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1621 IE_NAME = u'youtube:playlist'
1623 def __init__(self, downloader=None):
1624 InfoExtractor.__init__(self, downloader)
1626 def report_download_page(self, playlist_id, pagenum):
1627 """Report attempt to download playlist page with given number."""
1628 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1630 def _real_extract(self, url):
1631 # Extract playlist id
1632 mobj = re.match(self._VALID_URL, url)
1634 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video link inside a playlist URL: download just that video.
1638 if mobj.group(3) is not None:
1639 self._downloader.download([mobj.group(3)])
1642 # Download playlist pages
1643 # prefix is 'p' as default for playlists but there are other types that need extra care
1644 playlist_prefix = mobj.group(1)
1645 if playlist_prefix == 'a':
1646 playlist_access = 'artist'
1648 playlist_prefix = 'p'
1649 playlist_access = 'view_play_list'
1650 playlist_id = mobj.group(2)
# Page through the playlist HTML until the "Next »" control disappears.
1655 self.report_download_page(playlist_id, pagenum)
1656 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1657 request = compat_urllib_request.Request(url)
1659 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1660 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1661 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1664 # Extract video identifiers
1666 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1667 if mobj.group(1) not in ids_in_page:
1668 ids_in_page.append(mobj.group(1))
1669 video_ids.extend(ids_in_page)
1671 if self._MORE_PAGES_INDICATOR not in page:
1673 pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window (1-based start;
# -1 end means "to the end").
1675 total = len(video_ids)
1677 playliststart = self._downloader.params.get('playliststart', 1) - 1
1678 playlistend = self._downloader.params.get('playlistend', -1)
1679 if playlistend == -1:
1680 video_ids = video_ids[playliststart:]
1682 video_ids = video_ids[playliststart:playlistend]
1684 if len(video_ids) == total:
1685 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1687 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1689 for id in video_ids:
1690 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1694 class YoutubeChannelIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — the "if mobj is None:"
# guard, "return", pagenum/video_ids initialization, "try:" opener and the
# paging "break" fall in the numbering gaps. Code below is kept byte-identical.
1695 """Information Extractor for YouTube channels."""
1697 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1698 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1699 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1700 IE_NAME = u'youtube:channel'
1702 def report_download_page(self, channel_id, pagenum):
1703 """Report attempt to download channel page with given number."""
1704 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1706 def _real_extract(self, url):
# Page through the channel's /videos listing (oldest first per sort=da)
# collecting watch ids until the "Next »" control disappears, then
# dispatch each watch URL to the downloader.
1707 # Extract channel id
1708 mobj = re.match(self._VALID_URL, url)
1710 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1713 # Download channel pages
1714 channel_id = mobj.group(1)
1719 self.report_download_page(channel_id, pagenum)
1720 url = self._TEMPLATE_URL % (channel_id, pagenum)
1721 request = compat_urllib_request.Request(url)
1723 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1724 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1725 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1728 # Extract video identifiers
1730 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1731 if mobj.group(1) not in ids_in_page:
1732 ids_in_page.append(mobj.group(1))
1733 video_ids.extend(ids_in_page)
1735 if self._MORE_PAGES_INDICATOR not in page:
1737 pagenum = pagenum + 1
1739 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1741 for id in video_ids:
1742 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1746 class YoutubeUserIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — the "if mobj is None:"
# guard, "return", pagenum/video_ids initialization, "try:" opener,
# per-page ids_in_page reset and the loop "break" fall in the numbering
# gaps. Code below is kept byte-identical.
1747 """Information Extractor for YouTube users."""
1749 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1750 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps uploads queries at 50 results per request, hence the paging.
1751 _GDATA_PAGE_SIZE = 50
1752 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1753 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1754 IE_NAME = u'youtube:user'
1756 def __init__(self, downloader=None):
1757 InfoExtractor.__init__(self, downloader)
1759 def report_download_page(self, username, start_index):
1760 """Report attempt to download user page."""
1761 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1762 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1764 def _real_extract(self, url):
# Collect all upload ids for a user via the GData feed, apply the
# --playlist-start/--playlist-end window, then dispatch watch URLs.
1766 mobj = re.match(self._VALID_URL, url)
1768 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1771 username = mobj.group(1)
1773 # Download video ids using YouTube Data API. Result size per
1774 # query is limited (currently to 50 videos) so we need to query
1775 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1782 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1783 self.report_download_page(username, start_index)
1785 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1788 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1789 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1790 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1793 # Extract video identifiers
1796 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1797 if mobj.group(1) not in ids_in_page:
1798 ids_in_page.append(mobj.group(1))
1800 video_ids.extend(ids_in_page)
1802 # A little optimization - if current page is not
1803 # "full", ie. does not contain PAGE_SIZE video ids then
1804 # we can assume that this page is the last one - there
1805 # are no more ids on further pages - no need to query
1808 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's playlist window (1-based start; -1 end = to the end).
1813 all_ids_count = len(video_ids)
1814 playliststart = self._downloader.params.get('playliststart', 1) - 1
1815 playlistend = self._downloader.params.get('playlistend', -1)
1817 if playlistend == -1:
1818 video_ids = video_ids[playliststart:]
1820 video_ids = video_ids[playliststart:playlistend]
1822 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1823 (username, all_ids_count, len(video_ids)))
1825 for video_id in video_ids:
1826 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1829 class BlipTVUserIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — the "if mobj is None:"
# guard, "return", "try:" openers, pagenum/video_ids initialization and the
# loop "break" fall in the numbering gaps. Also note: line 1900 references
# self._PAGE_SIZE, whose class-level definition is not visible in this
# chunk — presumably 12 per the comment at 1867; verify. Code below is
# kept byte-identical.
1830 """Information Extractor for blip.tv users."""
1832 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1834 IE_NAME = u'blip.tv:user'
1836 def __init__(self, downloader=None):
1837 InfoExtractor.__init__(self, downloader)
1839 def report_download_page(self, username, pagenum):
1840 """Report attempt to download user page."""
1841 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1842 (self.IE_NAME, username, pagenum))
1844 def _real_extract(self, url):
# Resolve the username page to a numeric users_id, then page through the
# mobile AJAX episode list collecting video paths, window them with
# --playlist-start/--playlist-end, and dispatch blip.tv URLs.
1846 mobj = re.match(self._VALID_URL, url)
1848 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1851 username = mobj.group(1)
1853 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1855 request = compat_urllib_request.Request(url)
1858 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The numeric users_id needed by the AJAX endpoint is scraped from the
# profile page's data-users-id attribute.
1859 mobj = re.search(r'data-users-id="([^"]+)"', page)
1860 page_base = page_base % mobj.group(1)
1861 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1862 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1866 # Download video ids using BlipTV Ajax calls. Result size per
1867 # query is limited (currently to 12 videos) so we need to query
1868 # page by page until there are no video ids - it means we got
1875 self.report_download_page(username, pagenum)
1877 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1880 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# NOTE(review): this handler uses str(err) while the one above uses
# compat_str(err) — inconsistent error formatting in the original.
1881 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1882 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1885 # Extract video identifiers
1888 for mobj in re.finditer(r'href="/([^"]+)"', page):
1889 if mobj.group(1) not in ids_in_page:
1890 ids_in_page.append(unescapeHTML(mobj.group(1)))
1892 video_ids.extend(ids_in_page)
1894 # A little optimization - if current page is not
1895 # "full", ie. does not contain PAGE_SIZE video ids then
1896 # we can assume that this page is the last one - there
1897 # are no more ids on further pages - no need to query
1900 if len(ids_in_page) < self._PAGE_SIZE:
# Apply the user's playlist window (1-based start; -1 end = to the end).
1905 all_ids_count = len(video_ids)
1906 playliststart = self._downloader.params.get('playliststart', 1) - 1
1907 playlistend = self._downloader.params.get('playlistend', -1)
1909 if playlistend == -1:
1910 video_ids = video_ids[playliststart:]
1912 video_ids = video_ids[playliststart:playlistend]
1914 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1915 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1917 for video_id in video_ids:
1918 self._downloader.download([u'http://blip.tv/'+video_id])
1921 class DepositFilesIE(InfoExtractor):
# NOTE(review): numbered listing with omitted lines — "try:" openers,
# "if mobj is None:" guards, "return" statements and the info-dict opener/
# closer fall in the numbering gaps. The .decode('utf-8') calls on regex
# groups and on file_id imply Python-2 byte strings throughout; they would
# raise AttributeError on Python 3 str. Code below is kept byte-identical.
1922 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment — the "../" segment matches any two-char
# locale path component.
1924 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1926 def report_download_webpage(self, file_id):
1927 """Report webpage download."""
1928 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1930 def report_extraction(self, file_id):
1931 """Report information extraction."""
1932 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1934 def _real_extract(self, url):
# Simulate pressing "Free download" (gateway_result=1 POST), then scrape
# the real fileshare URL and the file title from the response page.
1935 file_id = url.split('/')[-1]
1936 # Rebuild url in english locale
1937 url = 'http://depositfiles.com/en/files/' + file_id
1939 # Retrieve file webpage with 'Free download' button pressed
1940 free_download_indication = { 'gateway_result' : '1' }
1941 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1943 self.report_download_webpage(file_id)
1944 webpage = compat_urllib_request.urlopen(request).read()
1945 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1946 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1949 # Search for the real file URL
1950 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1951 if (mobj is None) or (mobj.group(1) is None):
1952 # Try to figure out reason of the error.
# When the download form is absent, surface the site's own restriction
# notice (e.g. download-limit message) instead of a generic error.
1953 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1954 if (mobj is not None) and (mobj.group(1) is not None):
1955 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1956 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1958 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1961 file_url = mobj.group(1)
1962 file_extension = os.path.splitext(file_url)[1][1:]
1964 # Search for file title
1965 mobj = re.search(r'<b title="(.*?)">', webpage)
1967 self._downloader.trouble(u'ERROR: unable to extract title')
1969 file_title = mobj.group(1).decode('utf-8')
1972 'id': file_id.decode('utf-8'),
1973 'url': file_url.decode('utf-8'),
1975 'upload_date': None,
1976 'title': file_title,
1977 'ext': file_extension.decode('utf-8'),
# FacebookIE: extracts a Facebook video by parsing the SWF-parameter JSON
# embedded in the watch page. Optionally logs in first (credentials from
# downloader params or ~/.netrc) so restricted videos become visible.
# NOTE(review): sampled listing — login-form construction, `try:` headers
# and several guards/returns are elided from view (line numbers jump).
1981 class FacebookIE(InfoExtractor):
1982 """Information Extractor for Facebook"""
1984 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1985 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1986 _NETRC_MACHINE = 'facebook'
1987 IE_NAME = u'facebook'
1989 def report_login(self):
1990 """Report attempt to log in."""
1991 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
1993 def _real_initialize(self):
# One-time setup: best-effort login. Login failures only warn (to_stderr),
# they never abort extraction.
1994 if self._downloader is None:
1999 downloader_params = self._downloader.params
2001 # Attempt to use provided username and password or .netrc data
2002 if downloader_params.get('username', None) is not None:
2003 useremail = downloader_params['username']
2004 password = downloader_params['password']
2005 elif downloader_params.get('usenetrc', False):
2007 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2008 if info is not None:
2012 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2013 except (IOError, netrc.NetrcParseError) as err:
2014 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2017 if useremail is None:
# login_form construction is elided from this view; the request POSTs it
# to the mobile login endpoint.
2026 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2029 login_results = compat_urllib_request.urlopen(request).read()
# If the response still contains a login form, authentication failed.
2030 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2031 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2033 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2034 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2037 def _real_extract(self, url):
2038 mobj = re.match(self._VALID_URL, url)
2040 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2042 video_id = mobj.group('ID')
2044 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2045 webpage = self._download_webpage(url, video_id)
# The JSON array of flashvars sits between these two literal JS fragments.
2047 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2048 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2049 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2051 raise ExtractorError(u'Cannot parse data')
2052 data = dict(json.loads(m.group(1)))
2053 params_raw = compat_urllib_parse.unquote(data['params'])
2054 params = json.loads(params_raw)
# NOTE(review): only the HD source is read; if 'hd_src' is absent this
# raises KeyError with no SD fallback — confirm whether that is intended.
2055 video_url = params['hd_src']
2056 video_duration = int(params['video_duration'])
2058 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2060 raise ExtractorError(u'Cannot find title in webpage')
2061 video_title = unescapeHTML(m.group(1))
2065 'title': video_title,
2068 'duration': video_duration,
2069 'thumbnail': params['thumbnail_src'],
# BlipTVIE: fetches blip.tv metadata via the site's JSON API (requested with
# an iTunes User-Agent), with a fast path for URLs that already point
# directly at a video file (detected via the Content-Type header).
# NOTE(review): sampled listing — `try:` headers, `cchar` computation, the
# direct-download info dict, and `return` statements are elided from view.
2074 class BlipTVIE(InfoExtractor):
2075 """Information extractor for blip.tv"""
2077 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2078 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2079 IE_NAME = u'blip.tv'
2081 def report_extraction(self, file_id):
2082 """Report information extraction."""
2083 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2085 def report_direct_download(self, title):
2086 """Report information extraction."""
2087 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2089 def _real_extract(self, url):
2090 mobj = re.match(self._VALID_URL, url)
2092 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# `cchar` ('?' or '&', chosen out of view) joins the JSON API query string.
2099 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2100 request = compat_urllib_request.Request(json_url)
# blip.tv serves richer metadata to the iTunes client.
2101 request.add_header('User-Agent', 'iTunes/10.6.1')
2102 self.report_extraction(mobj.group(1))
2105 urlh = compat_urllib_request.urlopen(request)
2106 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2107 basename = url.split('/')[-1]
2108 title,ext = os.path.splitext(basename)
# NOTE(review): title.decode is a Python 2 idiom — AttributeError on py3 str.
2109 title = title.decode('UTF-8')
2110 ext = ext.replace('.', '')
2111 self.report_direct_download(title)
2116 'upload_date': None,
2121 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2122 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2123 if info is None: # Regular URL
2125 json_code_bytes = urlh.read()
2126 json_code = json_code_bytes.decode('utf-8')
2127 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2128 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2132 json_data = json.loads(json_code)
# The API sometimes wraps the payload in a 'Post' envelope.
2133 if 'Post' in json_data:
2134 data = json_data['Post']
# Convert blip.tv's 'MM-DD-YY HH:MM(am|pm)' stamp to YYYYMMDD.
2138 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2139 video_url = data['media']['url']
2140 umobj = re.match(self._URL_EXT, video_url)
2142 raise ValueError('Can not determine filename extension')
2143 ext = umobj.group(1)
2146 'id': data['item_id'],
2148 'uploader': data['display_name'],
2149 'upload_date': upload_date,
2150 'title': data['title'],
2152 'format': data['media']['mimeType'],
2153 'thumbnail': data['thumbnailUrl'],
2154 'description': data['description'],
2155 'player_url': data['embedUrl'],
# Downstream download must present the same UA the metadata was fetched with.
2156 'user_agent': 'iTunes/10.6.1',
2158 except (ValueError,KeyError) as err:
2159 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# MyVideoIE: builds the FLV media URL for a myvideo.de watch page from the
# thumbnail image_src link plus the numeric video id.
# NOTE(review): sampled listing — guard lines (`if mobj is None:`), the
# second re.search argument, `return`s and the info dict are elided.
2165 class MyVideoIE(InfoExtractor):
2166 """Information Extractor for myvideo.de."""
2168 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2169 IE_NAME = u'myvideo'
2171 def __init__(self, downloader=None):
2172 InfoExtractor.__init__(self, downloader)
2174 def report_extraction(self, video_id):
2175 """Report information extraction."""
2176 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2178 def _real_extract(self,url):
2179 mobj = re.match(self._VALID_URL, url)
# BUG(review): `self._download` is a typo for `self._downloader` — every
# other extractor in this file uses `self._downloader.trouble(...)`; as
# written this line raises AttributeError when it is reached.
2181 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2184 video_id = mobj.group(1)
2187 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2188 webpage = self._download_webpage(webpage_url, video_id)
2190 self.report_extraction(video_id)
# The image_src thumbnail link carries the movie base URL; the FLV lives
# beside the thumbs directory as <base>/<video_id>.flv.
2191 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2194 self._downloader.trouble(u'ERROR: unable to extract media URL')
2196 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2198 mobj = re.search('<title>([^<]+)</title>', webpage)
2200 self._downloader.trouble(u'ERROR: unable to extract title')
2203 video_title = mobj.group(1)
2209 'upload_date': None,
2210 'title': video_title,
# ComedyCentralIE: extracts Daily Show / Colbert Report episodes and clips.
# Pipeline: resolve shortcuts/redirects -> find the mtvnservices media URI
# in the page -> download the MRSS show index -> per item, download the
# mediaGen config, pick an RTMP rendition, and rewrite it to an HTTP URL.
# NOTE(review): sampled listing — `try:` headers, `return`s, several dict
# literals (_video_extensions/_video_dimensions bodies, per-item info) and
# format-selection loop lines are elided from view.
2214 class ComedyCentralIE(InfoExtractor):
2215 """Information extractor for The Daily Show and Colbert Report """
2217 # urls can be abbreviations like :thedailyshow or :colbert
2218 # urls for episodes like:
2219 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2220 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2221 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: matched with re.VERBOSE everywhere (see suitable()).
2222 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2223 |(https?://)?(www\.)?
2224 (?P<showname>thedailyshow|colbertnation)\.com/
2225 (full-episodes/(?P<episode>.*)|
2227 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2228 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the config feed may offer, lowest-quality-last not guaranteed;
# selection below takes turls[-1] (highest bitrate listed).
2231 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2233 _video_extensions = {
2241 _video_dimensions = {
2250 def suitable(self, url):
2251 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base class because _VALID_URL needs re.VERBOSE.
2252 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2254 def report_extraction(self, episode_id):
2255 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2257 def report_config_download(self, episode_id, media_id):
2258 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2260 def report_index_download(self, episode_id):
2261 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2263 def _print_formats(self, formats):
2264 print('Available formats:')
2266 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2269 def _real_extract(self, url):
2270 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2272 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ':tds'-style shortcuts are rewritten to the newest-full-episode URL and
# re-matched so the named groups are populated.
2275 if mobj.group('shortname'):
2276 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2277 url = u'http://www.thedailyshow.com/full-episodes/'
2279 url = u'http://www.colbertnation.com/full-episodes/'
2280 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2281 assert mobj is not None
2283 if mobj.group('clip'):
2284 if mobj.group('showname') == 'thedailyshow':
2285 epTitle = mobj.group('tdstitle')
2287 epTitle = mobj.group('cntitle')
2290 dlNewest = not mobj.group('episode')
2292 epTitle = mobj.group('showname')
2294 epTitle = mobj.group('episode')
2296 req = compat_urllib_request.Request(url)
2297 self.report_extraction(epTitle)
2299 htmlHandle = compat_urllib_request.urlopen(req)
2300 html = htmlHandle.read()
2301 webpage = html.decode('utf-8')
2302 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2303 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Follow the server-side redirect to the concrete episode page.
2306 url = htmlHandle.geturl()
2307 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2309 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2311 if mobj.group('episode') == '':
2312 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2314 epTitle = mobj.group('episode')
2316 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2318 if len(mMovieParams) == 0:
2319 # The Colbert Report embeds the information in a without
2320 # a URL prefix; so extract the alternate reference
2321 # and then add the URL prefix manually.
2323 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2324 if len(altMovieParams) == 0:
2325 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2328 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2330 uri = mMovieParams[0][1]
2331 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2332 self.report_index_download(epTitle)
2334 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2335 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2336 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One MRSS <item> per act/part of the episode.
2341 idoc = xml.etree.ElementTree.fromstring(indexXml)
2342 itemEls = idoc.findall('.//item')
2343 for partNum,itemEl in enumerate(itemEls):
2344 mediaId = itemEl.findall('./guid')[0].text
2345 shortMediaId = mediaId.split(':')[-1]
2346 showId = mediaId.split(':')[-2].replace('.com', '')
2347 officialTitle = itemEl.findall('./title')[0].text
2348 officialDate = itemEl.findall('./pubDate')[0].text
2350 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2351 compat_urllib_parse.urlencode({'uri': mediaId}))
2352 configReq = compat_urllib_request.Request(configUrl)
2353 self.report_config_download(epTitle, shortMediaId)
2355 configXml = compat_urllib_request.urlopen(configReq).read()
2356 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2357 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp_url) pairs from the mediaGen config.
2360 cdoc = xml.etree.ElementTree.fromstring(configXml)
2362 for rendition in cdoc.findall('.//rendition'):
2363 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2367 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2370 if self._downloader.params.get('listformats', None):
2371 self._print_formats([i[0] for i in turls])
2374 # For now, just pick the highest bitrate
2375 format,rtmp_video_url = turls[-1]
2377 # Get the format arg from the arg stream
2378 req_format = self._downloader.params.get('format', None)
2380 # Select format if we can find one
2383 format, rtmp_video_url = f, v
# rtmpdump is unreliable here, so the RTMP path is rewritten onto the
# known HTTP mirror on llnwd.net.
2386 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2388 raise ExtractorError(u'Cannot transform RTMP url')
2389 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2390 video_url = base + m.group('finalid')
2392 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2397 'upload_date': officialDate,
2402 'description': officialTitle,
2404 results.append(info)
# EscapistIE: scrapes OpenGraph meta tags from an Escapist Magazine video
# page, then downloads the player's JS "config" file and reads the media
# URL out of its playlist.
# NOTE(review): sampled listing — `try:` headers, guards and the final info
# dict braces are elided from view.
2409 class EscapistIE(InfoExtractor):
2410 """Information extractor for The Escapist """
2412 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2413 IE_NAME = u'escapist'
2415 def report_extraction(self, showName):
2416 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2418 def report_config_download(self, showName):
2419 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2421 def _real_extract(self, url):
2422 mobj = re.match(self._VALID_URL, url)
2424 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2426 showName = mobj.group('showname')
2427 videoId = mobj.group('episode')
2429 self.report_extraction(showName)
2431 webPage = compat_urllib_request.urlopen(url)
2432 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type, defaulting to UTF-8.
2433 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2434 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2435 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2436 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# NOTE(review): none of the four matches below is None-checked before
# .group(1) — a page without these meta tags raises AttributeError instead
# of a clean extractor error.
2439 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2440 description = unescapeHTML(descMatch.group(1))
2441 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2442 imgUrl = unescapeHTML(imgMatch.group(1))
2443 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2444 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries the config location as a 'config=' query param.
2445 configUrlMatch = re.search('config=(.*)$', playerUrl)
2446 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2448 self.report_config_download(showName)
2450 configJSON = compat_urllib_request.urlopen(configUrl)
2451 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2452 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2453 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2454 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2457 # Technically, it's JavaScript, not JSON
# Crude JS->JSON fixup: swap single quotes for double quotes before parsing.
2458 configJSON = configJSON.replace("'", '"')
2461 config = json.loads(configJSON)
2462 except (ValueError,) as err:
2463 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The media URL is the second playlist entry (index 1) — presumably entry 0
# is an intro/ad; TODO confirm against a live config.
2466 playlist = config['playlist']
2467 videoUrl = playlist[1]['url']
2472 'uploader': showName,
2473 'upload_date': None,
2476 'thumbnail': imgUrl,
2477 'description': description,
2478 'player_url': playerUrl,
# CollegeHumorIE: two-step extraction — the moogaloop metadata XML yields
# title/description/thumbnail and an Adobe HDS (f4m) manifest URL; the
# manifest then yields the media/segment URL that is rebuilt by hand.
# NOTE(review): sampled listing — `try:` headers, `return`s, the info-dict
# opening and trailing assignments are elided from view.
2483 class CollegeHumorIE(InfoExtractor):
2484 """Information extractor for collegehumor.com"""
2487 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2488 IE_NAME = u'collegehumor'
2490 def report_manifest(self, video_id):
2491 """Report information extraction."""
2492 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2494 def report_extraction(self, video_id):
2495 """Report information extraction."""
2496 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2498 def _real_extract(self, url):
2499 mobj = re.match(self._VALID_URL, url)
2501 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2503 video_id = mobj.group('videoid')
2508 'upload_date': None,
2511 self.report_extraction(video_id)
2512 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2514 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2515 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2516 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2519 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError when an element is missing; that is
# what the Invalid-metadata branch below reports.
2521 videoNode = mdoc.findall('./video')[0]
2522 info['description'] = videoNode.findall('./description')[0].text
2523 info['title'] = videoNode.findall('./caption')[0].text
2524 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2525 manifest_url = videoNode.findall('./file')[0].text
2527 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore parameter is required for the HDS manifest to be served.
2530 manifest_url += '?hdcore=2.10.3'
2531 self.report_manifest(video_id)
2533 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2534 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2535 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Parse the f4m manifest (Adobe HDS namespace) for media url and id.
2538 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2540 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2541 node_id = media_node.attrib['url']
2542 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2543 except IndexError as err:
2544 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Hand-build the first-segment URL from the manifest location and ids.
2547 url_pr = compat_urllib_parse_urlparse(manifest_url)
2548 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE: scrapes the FLV URL, title, and thumbnail straight out of the
# xvideos.com watch-page HTML.
# NOTE(review): sampled listing — `if mobj is None:` guards, `return`s and
# the info-dict opening are elided from view.
2555 class XVideosIE(InfoExtractor):
2556 """Information extractor for xvideos.com"""
2558 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2559 IE_NAME = u'xvideos'
2561 def report_extraction(self, video_id):
2562 """Report information extraction."""
2563 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2565 def _real_extract(self, url):
2566 mobj = re.match(self._VALID_URL, url)
2568 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2570 video_id = mobj.group(1)
2572 webpage = self._download_webpage(url, video_id)
2574 self.report_extraction(video_id)
# The page embeds the percent-encoded media URL in a 'flv_url=' flashvar.
2578 mobj = re.search(r'flv_url=(.+?)&', webpage)
2580 self._downloader.trouble(u'ERROR: unable to extract video url')
2582 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> tag minus the trailing " - XVID..." suffix.
2586 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2588 self._downloader.trouble(u'ERROR: unable to extract video title')
2590 video_title = mobj.group(1)
2593 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail (the capture group only
# isolates the filename).
2594 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2596 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2598 video_thumbnail = mobj.group(0)
2604 'upload_date': None,
2605 'title': video_title,
2607 'thumbnail': video_thumbnail,
2608 'description': None,
# SoundcloudIE: resolves a soundcloud.com/<uploader>/<slug> URL through the
# public resolve.json API to get the track id, then asks the streams API
# for the 128k MP3 stream URL.
# NOTE(review): sampled listing — `try:` headers, `return`s and the final
# info-dict braces are elided from view. The client_id below is hard-coded;
# if SoundCloud revokes it every request fails.
2614 class SoundcloudIE(InfoExtractor):
2615 """Information extractor for soundcloud.com
2616 To access the media, the uid of the song and a stream token
2617 must be extracted from the page source and the script must make
2618 a request to media.soundcloud.com/crossdomain.xml. Then
2619 the media can be grabbed by requesting from an url composed
2620 of the stream token and uid
2623 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2624 IE_NAME = u'soundcloud'
2626 def __init__(self, downloader=None):
2627 InfoExtractor.__init__(self, downloader)
2629 def report_resolve(self, video_id):
2630 """Report information extraction."""
2631 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2633 def report_extraction(self, video_id):
2634 """Report information extraction."""
2635 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2637 def _real_extract(self, url):
2638 mobj = re.match(self._VALID_URL, url)
2640 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2643 # extract uploader (which is in the url)
2644 uploader = mobj.group(1)
2645 # extract simple title (uploader + slug of song title)
2646 slug_title = mobj.group(2)
2647 simple_title = uploader + u'-' + slug_title
2649 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the human-readable URL to the track's JSON metadata.
2651 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2652 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2653 request = compat_urllib_request.Request(resolv_url)
2655 info_json_bytes = compat_urllib_request.urlopen(request).read()
2656 info_json = info_json_bytes.decode('utf-8')
2657 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2658 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2661 info = json.loads(info_json)
2662 video_id = info['id']
2663 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: fetch the stream map for the numeric track id.
2665 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2666 request = compat_urllib_request.Request(streams_url)
2668 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2669 stream_json = stream_json_bytes.decode('utf-8')
2670 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2671 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2674 streams = json.loads(stream_json)
# Only the 128kbps MP3 HTTP stream is used.
2675 mediaURL = streams['http_mp3_128_url']
2680 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is passed through as-is; other extractors
# normalize upload_date to YYYYMMDD — confirm downstream expectations.
2681 'upload_date': info['created_at'],
2682 'title': info['title'],
2684 'description': info['description'],
# InfoQIE: decodes the base64 'jsclassref' attribute on the page into the
# real media path and prefixes InfoQ's RTMPE server to form the video URL.
# NOTE(review): sampled listing — guards, `return`s and the info-dict
# opening are elided from view.
2688 class InfoQIE(InfoExtractor):
2689 """Information extractor for infoq.com"""
2690 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2692 def report_extraction(self, video_id):
2693 """Report information extraction."""
2694 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2696 def _real_extract(self, url):
2697 mobj = re.match(self._VALID_URL, url)
2699 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# No id in the URL pattern, so the URL itself doubles as the video_id.
2702 webpage = self._download_webpage(url, video_id=url)
2703 self.report_extraction(url)
2706 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2708 self._downloader.trouble(u'ERROR: unable to extract video url')
# jsclassref is base64 of a percent-encoded media path.
2710 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2711 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2714 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2716 self._downloader.trouble(u'ERROR: unable to extract video title')
2718 video_title = mobj.group(1)
2720 # Extract description
2721 video_description = u'No description available.'
2722 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2723 if mobj is not None:
2724 video_description = mobj.group(1)
# Derive a usable id and extension from the media filename.
2726 video_filename = video_url.split('/')[-1]
2727 video_id, extension = video_filename.split('.')
2733 'upload_date': None,
2734 'title': video_title,
2735 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2737 'description': video_description,
# MixcloudIE: fetches a cloudcast's JSON (api/1/cloudcast/...), walks its
# 'audio_formats' map (format -> bitrate -> url list), and returns the
# first URL that answers a probe request. Marked broken (_WORKING = False).
# NOTE(review): sampled listing — `try:` headers, `return`s and several
# loop/branch lines are elided. The .decode('utf-8') calls are Python 2
# idioms and would raise AttributeError on py3 str.
2742 class MixcloudIE(InfoExtractor):
2743 """Information extractor for www.mixcloud.com"""
# Disabled: skipped by tests and flagged to users (see _WORKING contract).
2745 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2746 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2747 IE_NAME = u'mixcloud'
2749 def __init__(self, downloader=None):
2750 InfoExtractor.__init__(self, downloader)
2752 def report_download_json(self, file_id):
2753 """Report JSON download."""
2754 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2756 def report_extraction(self, file_id):
2757 """Report information extraction."""
2758 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2760 def get_urls(self, jsonData, fmt, bitrate='best'):
2761 """Get urls from 'audio_formats' section in json"""
2764 bitrate_list = jsonData[fmt]
2765 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# 'best' / unknown bitrate falls back to the highest available key.
2766 bitrate = max(bitrate_list) # select highest
2768 url_list = jsonData[fmt][bitrate]
# Entry may be a flat url list instead of a bitrate map.
2769 except TypeError: # we have no bitrate info.
2770 url_list = jsonData[fmt]
2773 def check_urls(self, url_list):
2774 """Returns 1st active url from list"""
# Probes each candidate with a GET; first one that opens wins.
2775 for url in url_list:
2777 compat_urllib_request.urlopen(url)
2779 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2784 def _print_formats(self, formats):
2785 print('Available formats:')
2786 for fmt in formats.keys():
2787 for b in formats[fmt]:
2789 ext = formats[fmt][b][0]
2790 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2791 except TypeError: # we have no bitrate info
2792 ext = formats[fmt][0]
2793 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2796 def _real_extract(self, url):
2797 mobj = re.match(self._VALID_URL, url)
2799 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2801 # extract uploader & filename from url
2802 uploader = mobj.group(1).decode('utf-8')
2803 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2805 # construct API request
# Reuses the last two URL path segments (uploader/cloudcast-slug).
2806 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2807 # retrieve .json file with links to files
2808 request = compat_urllib_request.Request(file_url)
2810 self.report_download_json(file_url)
2811 jsonData = compat_urllib_request.urlopen(request).read()
2812 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2813 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2817 json_data = json.loads(jsonData)
2818 player_url = json_data['player_swf_url']
2819 formats = dict(json_data['audio_formats'])
2821 req_format = self._downloader.params.get('format', None)
2824 if self._downloader.params.get('listformats', None):
2825 self._print_formats(formats)
# No explicit format requested: take the first format with a live URL.
2828 if req_format is None or req_format == 'best':
2829 for format_param in formats.keys():
2830 url_list = self.get_urls(formats, format_param)
2832 file_url = self.check_urls(url_list)
2833 if file_url is not None:
2836 if req_format not in formats:
2837 self._downloader.trouble(u'ERROR: format is not available')
2840 url_list = self.get_urls(formats, req_format)
2841 file_url = self.check_urls(url_list)
2842 format_param = req_format
2845 'id': file_id.decode('utf-8'),
2846 'url': file_url.decode('utf-8'),
2847 'uploader': uploader.decode('utf-8'),
2848 'upload_date': None,
2849 'title': json_data['name'],
2850 'ext': file_url.split('.')[-1].decode('utf-8'),
2851 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2852 'thumbnail': json_data['thumbnail_url'],
2853 'description': json_data['description'],
2854 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: three-level extractor. A VideoPage URL yields one
# video (metadata from a per-video XML file); a CoursePage yields the list
# of its videos; the site root yields the list of courses. List pages
# recurse by calling self.extract() on each discovered reference.
# NOTE(review): sampled listing — `try:` headers, `return`s, info-dict
# braces and several loop lines are elided from view.
2857 class StanfordOpenClassroomIE(InfoExtractor):
2858 """Information extractor for Stanford's Open ClassRoom"""
2860 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2861 IE_NAME = u'stanfordoc'
2863 def report_download_webpage(self, objid):
2864 """Report information extraction."""
2865 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2867 def report_extraction(self, video_id):
2868 """Report information extraction."""
2869 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2871 def _real_extract(self, url):
2872 mobj = re.match(self._VALID_URL, url)
2874 raise ExtractorError(u'Invalid URL: %s' % url)
2876 if mobj.group('course') and mobj.group('video'): # A specific video
2877 course = mobj.group('course')
2878 video = mobj.group('video')
2880 'id': course + '_' + video,
2882 'upload_date': None,
2885 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the course's videos dir.
2886 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2887 xmlUrl = baseUrl + video + '.xml'
2889 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2890 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2891 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2893 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2895 info['title'] = mdoc.findall('./title')[0].text
2896 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2898 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2900 info['ext'] = info['url'].rpartition('.')[2]
2902 elif mobj.group('course'): # A course page
2903 course = mobj.group('course')
2908 'upload_date': None,
2911 coursepage = self._download_webpage(url, info['id'],
2912 note='Downloading course info page',
2913 errnote='Unable to download course info page')
2915 m = re.search('<h1>([^<]+)</h1>', coursepage)
2917 info['title'] = unescapeHTML(m.group(1))
# Fall back to the course id when no <h1> title is present.
2919 info['title'] = info['id']
2921 m = re.search('<description>([^<]+)</description>', coursepage)
2923 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps first occurrence order while de-duplicating links.
2925 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2928 'type': 'reference',
2929 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse into each video page; results are accumulated flat.
2933 for entry in info['list']:
2934 assert entry['type'] == 'reference'
2935 results += self.extract(entry['url'])
2939 'id': 'Stanford OpenClassroom',
2942 'upload_date': None,
2945 self.report_download_webpage(info['id'])
2946 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2948 rootpage = compat_urllib_request.urlopen(rootURL).read()
2949 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2950 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2953 info['title'] = info['id']
2955 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2958 'type': 'reference',
2959 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
# Recurse into each course page discovered from the site root.
2964 for entry in info['list']:
2965 assert entry['type'] == 'reference'
2966 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    NOTE(review): this excerpt is missing structural lines (`if mobj is
    None:` guards, `try:` headers, the `return` of the info dict); the
    surviving statements are kept verbatim.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the video page, read the mtv_* meta tags, then fetch the mediaGen XML."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Normalise schemeless URLs before downloading.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): message below is garbled — likely meant "unable to extract mtvn_uri".
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # NOTE(review): `try:` header missing before the network fetch.
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        # Fragment of the returned info dict (its opener is missing).
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented FLV downloads).

    NOTE(review): this excerpt is missing structural lines (the
    `def _gen_sid(self):` header, loop headers, `if mobj is None:`
    guards, `try:` headers, dict openers); statements kept verbatim.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    # NOTE(review): the `def _gen_sid(self):` header appears to be missing;
    # this builds a pseudo-random session id from the current time.
    nowTime = int(time.time() * 1000)
    random1 = random.randint(1000,1998)
    random2 = random.randint(1000,9999)
    return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character-shuffle table from *seed* (Youku's LCG scramble)."""
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            # Linear congruential step drives the pseudo-random pick.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): the loop header over `ids` appears to be missing.
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, pick a format, and emit one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        # Format selection; the branch bodies are partially missing from the excerpt.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Splice the segment number (hex) into the decoded file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # Fragment of the per-segment info dict (its opener is missing).
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    NOTE(review): excerpt is missing `if ... is None:` guards, `try:`
    headers and the return-dict opener; statements kept verbatim.
    """

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # Regexes for the flash URL, page title and thumbnail embedded in the page.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the page and scrape URL, title and thumbnail with the class regexes."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # Fragment of the returned info dict (its opener is missing).
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com.

    NOTE(review): excerpt is missing `if mobj is None:` guards, `try:`
    headers and the return-dict opener; statements kept verbatim.
    """

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report that the post entry is being downloaded."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Scrape a G+ post page, then the photo/video page it links to, for the stream URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video page URL')
        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # Fragment of the returned info dict (its opener is missing).
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    NOTE(review): excerpt is missing `if mobj is None:` guards and the
    return-dict opener; statements kept verbatim.
    """

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        """Derive the CDN URL from the page path and scrape title/date/description."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # Media URL is constructed directly from the page path, no scraping needed.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Scrape a single regex group out of the already-downloaded page.
            m = re.search(rexp, webpage)
            return unescapeHTML(m.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # Fragment of the returned info dict (its opener is missing).
        'id': shortened_video_id,
        # NOTE(review): 'uploader_date' looks like a typo for 'upload_date' — confirm before relying on it.
        'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
        'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    # NOTE(review): excerpt is missing several structural lines (`try:`
    # headers, `if mobj is None:` guards, dict openers, loop headers);
    # statements kept verbatim.

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100  # videos fetched per API request
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one page of the Justin.tv JSON API and convert clips to info dicts."""
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        # A non-list response is the API's error object.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time is ISO-style; strip dashes to get YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # Fragment of the per-clip info dict (its opener is missing).
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        """Choose channel-archive vs single-broadcast API and page through results."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        if mobj.lastindex == 1:
            api += '/channel/archives/%s.json'
            # NOTE(review): the `else:` separating these two branches is missing.
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        # Page through the API until a short page signals the end.
        limit = self._JUSTIN_PAGE_LIMIT
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url)
        info.extend(page_info)
        if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com.

    NOTE(review): excerpt is missing `if ... is None:` guards and the
    return of the info dict; statements kept verbatim.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the <video>/<source> tag, title anchor and og:description."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))
        # Fragment of the returned info dict (its opener is missing).
        'description': desc,
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com.

    NOTE(review): excerpt is missing `if m is None:` guards and the
    return-dict opener; statements kept verbatim.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        """Scrape status id, tweet text, uploader and timestamp, then build the .mov URL."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded anchors from the tweet text before unescaping.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # Media URL is derived directly from the status id.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
        # Fragment of the returned info dict (its opener is missing).
        'description': desc,
        'uploader': uploader,
        'uploader_id': uploader_id,
        'internal_id': status_id,
        'upload_date': upload_date
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers.

    NOTE(review): the _VALID_URL verbose-regex literal is unterminated in
    this excerpt (its closing quotes and the gameID group line are
    missing), and the per-video info-dict lines are incomplete.
    """

    _VALID_URL = r"""http://store.steampowered.com/
        (?P<urltype>video|app)/ #If the page is only for videos or for a game
        (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """List every trailer on the game's video page by pairing movie JS blobs with titles."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        # Pair each movie blob with the matching on-page title, in order.
        for vid,vtitle in zip(mweb,titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            # Fragment of the per-video info dict (its opener is missing).
            'title': unescapeHTML(title)
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos.

    NOTE(review): the info-dict opener/return is missing from this excerpt.
    """

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN URL from the video id and scrape title/uploader from the page."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media URL is derived directly from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        # Fragment of the returned info dict (its opener is missing).
        'uploader': uploader
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows.

    NOTE(review): excerpt is missing an `if m is None:` guard, a `try:`
    header and the return-dict opener; statements kept verbatim.
    """

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Read the embedded `gon.show` JSON and derive the stream URL from it."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)
        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append the bitrate query parameter expected by the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # Fragment of the returned info dict (its opener is missing).
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com.

    NOTE(review): excerpt is missing several structural lines (loop
    headers, `if ... is None:` guards, dict openers, `return`
    statements); statements kept verbatim.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats (used by --list-formats)."""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format entry matching *req_format*."""
        # NOTE(review): the loop header over *formats* appears to be missing.
        if(x["format"]==req_format):

    def _real_extract(self, url):
        """Scrape title/date/uploader and enumerate the download-list links as formats."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a pre-set cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract video date')
        upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        self._downloader.to_stderr(u'ERROR: unable to extract uploader')
        video_uploader = None
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        # NOTE(review): the `for link in links:` header appears to be missing below.
        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Resolution and bitrate are encoded in one path component, e.g. 480p_370k.
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        title = u'%s-%s-%s' % (video_title, size, bitrate)

        # Fragment of the per-format info dict (its opener is missing).
        'uploader': video_uploader,
        'upload_date': upload_date,
        'description': None,

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # Format selection; several branch bodies are missing from the excerpt.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        format = self._specific( req_format, formats )
        self._downloader.trouble(u'ERROR: requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com.

    NOTE(review): excerpt is missing `if ... is None:` guards and the
    remainder of the info dict; statements kept verbatim.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Take id/title from the URL itself and scrape the FLV URL and date from the page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        # NOTE(review): message mentions "title" but this step extracts the date.
        self._downloader.trouble(u'ERROR: unable to extract video title')
        upload_date = result.group('date')

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com.

    NOTE(review): excerpt is missing `if ... is None:` guards and the
    tail of the returned info dict; statements kept verbatim.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page referenced by the watch page and read the file variable."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The stream URL is passed to the flash player via addVariable("file", ...).
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    NOTE(review): excerpt is missing `if ... is None:` guards, the
    `mix_id` assignment, per-track dict openers and the loop's `break`;
    statements kept verbatim.
    """

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Read the TRAX.Mix JSON from the page, then walk the play/next API per track."""
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # The play API requires a client-chosen session token.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # Walk the playlist one track at a time until the API reports the last track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # Fragment of the per-track info dict (its opener is missing).
            'id': track_data['id'],
            'url': track_data['track_file_stream_url'],
            'title': track_data['performer'] + u' - ' + track_data['name'],
            'raw_title': track_data['name'],
            'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3903 def gen_extractors():
3904 """ Return a list of an instance of every supported extractor.
3905 The order does matter; the first extractor matched is the one handling the URL.
3908 YoutubePlaylistIE(),
3932 StanfordOpenClassroomIE(),