2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this span arrived as a mangled paste with dropped lines;
    # the missing glue (class attrs, guards, try/except) was reconstructed --
    # verify against upstream history.
    _ready = False        # set once _real_initialize() has run
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # set to False in broken subclasses

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the expensive real initialization only once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name by stripping the 'IE' suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # Be lenient about broken encodings in scraped pages.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this span arrived as a mangled paste with dropped lines;
    # missing guards, try/except glue and the 'v=' regex line were
    # reconstructed -- verify against upstream history.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (entries between the visible ones
    # reconstructed from the itag table -- TODO confirm)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string (reconstructed -- TODO confirm)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the base-class
        # suitable() (plain re.match) would not work here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt file contents."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Return (error_message_or_None, srt_contents_or_None) for video_id."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the known extension and dimensions for each itag in formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language so date strings and pages come back in English.
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':         '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID for url, or report an error if it is invalid."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 is the bare video ID (group 1 is the optional URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    pass  # try the next date format

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this span arrived as a mangled paste with dropped lines;
    # guards and try/except glue reconstructed -- verify against upstream history.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: POST the family-filter form to disable filtering.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the downloader.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this span arrived as a mangled paste with dropped lines;
    # guards and loop break/else reconstructed -- verify against upstream history.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Strip title suffix and query string from the captured slug.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality available, in descending order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this span arrived as a mangled paste with dropped lines;
    # guards and try/except glue reconstructed -- verify against upstream history.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this span arrived as a mangled paste with dropped lines;
    # guards and try/except glue reconstructed -- verify against upstream history.
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
# NOTE(review): the embedded numbering (977, 978, ...) shows gaps — `if mobj is None:`
# guards, `try:` headers, `return` statements and the result-dict brackets appear to
# have been elided from this listing. Comments below describe only what is visible;
# verify any edit against the complete upstream file.
977 class VimeoIE(InfoExtractor):
978 """Information extractor for vimeo.com."""
980 # _VALID_URL matches Vimeo URLs
# Named groups: 'proto' (scheme, may be absent), 'direct_link' (player redirect
# form), 'id' (the numeric clip id used throughout extraction).
981 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
984 def __init__(self, downloader=None):
985 InfoExtractor.__init__(self, downloader)
987 def report_download_webpage(self, video_id):
988 """Report webpage download."""
989 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
991 def report_extraction(self, video_id):
992 """Report information extraction."""
993 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Main extraction: fetch the page, slice out the embedded config JSON, then pick
# a codec/quality pair and build the play_redirect URL from the request signature.
995 def _real_extract(self, url, new_video=True):
996 # Extract ID from URL
997 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the usual `if mobj is None:` guard is not visible here.
999 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1002 video_id = mobj.group('id')
# Normalize scheme-less and player-redirect URLs to a canonical https page URL.
1003 if not mobj.group('proto'):
1004 url = 'https://' + url
1005 if mobj.group('direct_link'):
1006 url = 'https://vimeo.com/' + video_id
1008 # Retrieve video webpage to extract further information
1009 request = compat_urllib_request.Request(url, None, std_headers)
1011 self.report_download_webpage(video_id)
1012 webpage_bytes = compat_urllib_request.urlopen(request).read()
1013 webpage = webpage_bytes.decode('utf-8')
1014 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1015 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1018 # Now we begin extracting as much information as we can from what we
1019 # retrieved. First we extract the information common to all extractors,
1020 # and latter we extract those that are Vimeo specific.
1021 self.report_extraction(video_id)
1023 # Extract the config JSON
# Fragile string slicing between ' = {config:' and ',assets:' — breaks if
# Vimeo changes the page layout; the surrounding try/except is not visible here.
1025 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1026 config = json.loads(config)
1028 self._downloader.trouble(u'ERROR: unable to extract info section')
1032 video_title = config["video"]["title"]
1034 # Extract uploader and uploader_id
1035 video_uploader = config["video"]["owner"]["name"]
# uploader_id is the last path component of the owner's profile URL.
1036 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1038 # Extract video thumbnail
1039 video_thumbnail = config["video"]["thumbnail"]
1041 # Extract video description
1042 video_description = get_element_by_attribute("itemprop", "description", webpage)
1043 if video_description: video_description = clean_html(video_description)
1044 else: video_description = ''
1046 # Extract upload date
# Date comes from the itemprop meta tag, reassembled as YYYYMMDD.
1047 video_upload_date = None
1048 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1049 if mobj is not None:
1050 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1052 # Vimeo specific: extract request signature and timestamp
1053 sig = config['request']['signature']
1054 timestamp = config['request']['timestamp']
1056 # Vimeo specific: extract video codec and quality information
1057 # First consider quality, then codecs, then take everything
1058 # TODO bind to format param
1059 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1060 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by its best available quality tier.
1061 for codec_name, codec_extension in codecs:
1062 if codec_name in config["video"]["files"]:
1063 if 'hd' in config["video"]["files"][codec_name]:
1064 files['hd'].append((codec_name, codec_extension, 'hd'))
1065 elif 'sd' in config["video"]["files"][codec_name]:
1066 files['sd'].append((codec_name, codec_extension, 'sd'))
# Fallback tier: first listed quality for this codec.
1068 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first non-empty tier in preference order hd > sd > other.
1070 for quality in ('hd', 'sd', 'other'):
1071 if len(files[quality]) > 0:
1072 video_quality = files[quality][0][2]
1073 video_codec = files[quality][0][0]
1074 video_extension = files[quality][0][1]
1075 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
# NOTE(review): presumably reached only when no codec matched (loop else / fallthrough not visible).
1078 self._downloader.trouble(u'ERROR: no known codec found')
1081 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1082 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dictionary (opening bracket and 'id'/'url' keys elided in this listing).
1087 'uploader': video_uploader,
1088 'uploader_id': video_uploader_id,
1089 'upload_date': video_upload_date,
1090 'title': video_title,
1091 'ext': video_extension,
1092 'thumbnail': video_thumbnail,
1093 'description': video_description,
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, `return` statements, grep_webpage's info-dict initialization).
# Comments describe only the visible code; verify against the full file.
1097 class ArteTvIE(InfoExtractor):
1098 """arte.tv information extractor."""
1100 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# _LIVE_URL distinguishes live-stream index pages from regular video pages.
1101 _LIVE_URL = r'index-[0-9]+\.html$'
1103 IE_NAME = u'arte.tv'
1105 def __init__(self, downloader=None):
1106 InfoExtractor.__init__(self, downloader)
1108 def report_download_webpage(self, video_id):
1109 """Report webpage download."""
1110 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1112 def report_extraction(self, video_id):
1113 """Report information extraction."""
1114 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Fetch a URL and return the raw page bytes; errors are routed to trouble().
1116 def fetch_webpage(self, url):
1117 request = compat_urllib_request.Request(url)
1119 self.report_download_webpage(url)
1120 webpage = compat_urllib_request.urlopen(request).read()
1121 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1122 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1124 except ValueError as err:
1125 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Download `url`, apply `regex` with `regexFlags`, and map capture groups to an
# info dict: each matchTuple is (group_index, key, error_message_if_missing).
1129 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1130 page = self.fetch_webpage(url)
1131 mobj = re.search(regex, page, regexFlags)
# NOTE(review): the `if mobj is None:` guard around this trouble() call is not visible.
1135 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1138 for (i, key, err) in matchTuples:
1139 if mobj.group(i) is None:
1140 self._downloader.trouble(err)
1143 info[key] = mobj.group(i)
# Live streams: locate the videothek JS, then pull stream path + SWF player from it.
1147 def extractLiveStream(self, url):
# Language code is taken from a fixed position in the URL path.
1148 video_lang = url.split('/')[-4]
1149 info = self.grep_webpage(
1151 r'src="(.*?/videothek_js.*?\.js)',
1154 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1157 http_host = url.split('/')[2]
1158 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1159 info = self.grep_webpage(
1161 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1162 '(http://.*?\.swf).*?' +
1166 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1167 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1168 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1171 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" catch-up streams: follow videorefFileUrl, pick the language-specific
# ref, then read id/title/date and the HD url from the final XML-ish document.
1173 def extractPlus7Stream(self, url):
1174 video_lang = url.split('/')[-3]
1175 info = self.grep_webpage(
1177 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1180 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1183 next_url = compat_urllib_parse.unquote(info.get('url'))
1184 info = self.grep_webpage(
1186 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1189 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1192 next_url = compat_urllib_parse.unquote(info.get('url'))
1194 info = self.grep_webpage(
1196 r'<video id="(.*?)".*?>.*?' +
1197 '<name>(.*?)</name>.*?' +
1198 '<dateVideo>(.*?)</dateVideo>.*?' +
1199 '<url quality="hd">(.*?)</url>',
1202 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1203 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1204 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1205 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Result dictionary (opening bracket elided in this listing).
1210 'id': info.get('id'),
1211 'url': compat_urllib_parse.unquote(info.get('url')),
1212 'uploader': u'arte.tv',
1213 'upload_date': info.get('date'),
1214 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live index pages vs. regular Plus-7 video pages.
1220 def _real_extract(self, url):
1221 video_id = url.split('/')[-1]
1222 self.report_extraction(video_id)
1224 if re.search(self._LIVE_URL, video_id) is not None:
1225 self.extractLiveStream(url)
1228 info = self.extractPlus7Stream(url)
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, `return` statements, the HeadRequest method body). Comments
# describe only the visible code; verify against the full file.
1233 class GenericIE(InfoExtractor):
1234 """Generic last-resort information extractor."""
1237 IE_NAME = u'generic'
1239 def __init__(self, downloader=None):
1240 InfoExtractor.__init__(self, downloader)
1242 def report_download_webpage(self, video_id):
1243 """Report webpage download."""
# Warn loudly: reaching this IE means no specific extractor matched the URL.
1244 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1245 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1247 def report_extraction(self, video_id):
1248 """Report information extraction."""
1249 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1251 def report_following_redirect(self, new_url):
1252 """Report information extraction."""
1253 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1255 def _test_redirect(self, url):
1256 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that issues HEAD instead of GET (body elided here;
# get_method presumably returns "HEAD").
1257 class HeadRequest(compat_urllib_request.Request):
1258 def get_method(self):
1261 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1263 Subclass the HTTPRedirectHandler to make it use our
1264 HeadRequest also on the redirected URL
1266 def redirect_request(self, req, fp, code, msg, headers, newurl):
1267 if code in (301, 302, 303, 307):
# Some servers emit unencoded spaces in Location headers.
1268 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers — a HEAD request has no body.
1269 newheaders = dict((k,v) for k,v in req.headers.items()
1270 if k.lower() not in ("content-length", "content-type"))
1271 return HeadRequest(newurl,
1273 origin_req_host=req.get_origin_req_host(),
1276 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1278 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1280 Fallback to GET if HEAD is not allowed (405 HTTP error)
1282 def http_error_405(self, req, fp, code, msg, headers):
1286 newheaders = dict((k,v) for k,v in req.headers.items()
1287 if k.lower() not in ("content-length", "content-type"))
# Retry the same URL with a plain (GET) Request through the parent opener.
1288 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1290 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener wired with the HEAD-aware handlers above.
1294 opener = compat_urllib_request.OpenerDirector()
1295 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1296 HTTPMethodFallback, HEADRedirectHandler,
1297 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1298 opener.add_handler(handler())
1300 response = opener.open(HeadRequest(url))
1301 new_url = response.geturl()
# NOTE(review): the comparison of new_url against url (to detect an actual
# redirect) is not visible in this listing.
1306 self.report_following_redirect(new_url)
# Restart the extraction chain on the redirect target.
1307 self._downloader.download([new_url])
1310 def _real_extract(self, url):
1311 if self._test_redirect(url): return
1313 video_id = url.split('/')[-1]
1314 request = compat_urllib_request.Request(url)
1316 self.report_download_webpage(video_id)
1317 webpage = compat_urllib_request.urlopen(request).read()
1318 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1319 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1321 except ValueError as err:
1322 # since this is the last-resort InfoExtractor, if
1323 # this error is thrown, it'll be thrown here
1324 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1327 self.report_extraction(video_id)
1328 # Start with something easy: JW Player in SWFObject
1329 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1331 # Broaden the search a little bit
1332 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
# NOTE(review): `if mobj is None:` guards around these trouble() calls are not visible.
1334 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1337 # It's possible that one of the regexes
1338 # matched, but returned an empty group:
1339 if mobj.group(1) is None:
1340 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1343 video_url = compat_urllib_parse.unquote(mobj.group(1))
1344 video_id = os.path.basename(video_url)
1346 # here's a fun little line of code for you:
# Derive extension from the basename, then strip it off the id.
1347 video_extension = os.path.splitext(video_id)[1][1:]
1348 video_id = os.path.splitext(video_id)[0]
1350 # it's tempting to parse this further, but you would
1351 # have to take into account all the variations like
1352 #   Video Title - Site Name
1353 #   Site Name | Video Title
1354 #   Video Title - Tagline  | Site Name
1355 # and so on and so forth; it's just not practical
1356 mobj = re.search(r'<title>(.*)</title>', webpage)
1358 self._downloader.trouble(u'ERROR: unable to extract title')
1360 video_title = mobj.group(1)
1362 # video uploader is domain name
1363 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1365 self._downloader.trouble(u'ERROR: unable to extract title')
1367 video_uploader = mobj.group(1)
# Result dictionary (opening bracket and 'id'/'url' keys elided in this listing).
1372 'uploader': video_uploader,
1373 'upload_date': None,
1374 'title': video_title,
1375 'ext': video_extension,
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, `return` statements, pagination-loop setup). Comments
# describe only the visible code; verify against the full file.
1379 class YoutubeSearchIE(InfoExtractor):
1380 """Information Extractor for YouTube search queries."""
# Accepts "ytsearch:QUERY", "ytsearchN:QUERY" or "ytsearchall:QUERY".
1381 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData API endpoint; %i is the 1-based start index, page size fixed at 50.
1382 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1383 _max_youtube_results = 1000
1384 IE_NAME = u'youtube:search'
1386 def __init__(self, downloader=None):
1387 InfoExtractor.__init__(self, downloader)
1389 def report_download_page(self, query, pagenum):
1390 """Report attempt to download search page with given number."""
1391 query = query.decode(preferredencoding())
1392 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (none / 'all' / integer N) and dispatch to _download_n_results.
1394 def _real_extract(self, query):
1395 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the `if mobj is None:` guard is not visible in this listing.
1397 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1400 prefix, query = query.split(':')
1402 query = query.encode('utf-8')
# No prefix: download just the first result.
1404 self._download_n_results(query, 1)
1406 elif prefix == 'all':
1407 self._download_n_results(query, self._max_youtube_results)
# NOTE(review): the `n = int(prefix)` / `if n <= 0:` lines are not visible here.
1413 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1415 elif n > self._max_youtube_results:
1416 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1417 n = self._max_youtube_results
1418 self._download_n_results(query, n)
1420 except ValueError: # parsing prefix as integer fails
1421 self._download_n_results(query, 1)
1424 def _download_n_results(self, query, n):
1425 """Downloads a specified number of results for a query"""
# Page through the API (50 ids per page) until `limit` results are collected.
1431 while (50 * pagenum) < limit:
1432 self.report_download_page(query, pagenum+1)
1433 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1434 request = compat_urllib_request.Request(result_url)
1436 data = compat_urllib_request.urlopen(request).read()
1437 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1438 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1440 api_response = json.loads(data)['data']
1442 new_ids = list(video['id'] for video in api_response['items'])
1443 video_ids += new_ids
# Never ask for more than the API reports as available.
1445 limit = min(n, api_response['totalItems'])
1448 if len(video_ids) > n:
1449 video_ids = video_ids[:n]
# Hand each collected id back to the downloader as a watch URL.
1450 for id in video_ids:
1451 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, loop setup, `return` statements). Comments describe only the
# visible code; verify against the full file.
1455 class GoogleSearchIE(InfoExtractor):
1456 """Information Extractor for Google Video search queries."""
# Accepts "gvsearch:QUERY", "gvsearchN:QUERY" or "gvsearchall:QUERY".
1457 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1458 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Regex that pulls each result's docid out of the HTML results page.
1459 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" control marks that more pages exist.
1460 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1461 _max_google_results = 1000
1462 IE_NAME = u'video.google:search'
1464 def __init__(self, downloader=None):
1465 InfoExtractor.__init__(self, downloader)
1467 def report_download_page(self, query, pagenum):
1468 """Report attempt to download playlist page with given number."""
1469 query = query.decode(preferredencoding())
1470 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (none / 'all' / integer N) and dispatch to _download_n_results.
1472 def _real_extract(self, query):
1473 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the `if mobj is None:` guard is not visible in this listing.
1475 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1478 prefix, query = query.split(':')
1480 query = query.encode('utf-8')
1482 self._download_n_results(query, 1)
1484 elif prefix == 'all':
1485 self._download_n_results(query, self._max_google_results)
# NOTE(review): `n = int(prefix)` / `if n <= 0:` lines are not visible here.
1491 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1493 elif n > self._max_google_results:
1494 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1495 n = self._max_google_results
1496 self._download_n_results(query, n)
1498 except ValueError: # parsing prefix as integer fails
1499 self._download_n_results(query, 1)
1502 def _download_n_results(self, query, n):
1503 """Downloads a specified number of results for a query"""
# Scrape HTML result pages (10 results each) until n ids are collected or no
# "next page" marker remains.
1509 self.report_download_page(query, pagenum)
1510 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1511 request = compat_urllib_request.Request(result_url)
1513 page = compat_urllib_request.urlopen(request).read()
1514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1515 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1518 # Extract video identifiers
1519 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1520 video_id = mobj.group(1)
1521 if video_id not in video_ids:
1522 video_ids.append(video_id)
1523 if len(video_ids) == n:
1524 # Specified n videos reached
1525 for id in video_ids:
1526 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No more pages: flush whatever ids we have collected so far.
1529 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1530 for id in video_ids:
1531 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1534 pagenum = pagenum + 1
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, loop setup, `return` statements). Comments describe only the
# visible code; verify against the full file.
1537 class YahooSearchIE(InfoExtractor):
1538 """Information Extractor for Yahoo! Video search queries."""
# Accepts "yvsearch:QUERY", "yvsearchN:QUERY" or "yvsearchall:QUERY".
1541 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1542 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Regex that pulls each result's "uid/vid" id pair out of the results page.
1543 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1544 _MORE_PAGES_INDICATOR = r'\s*Next'
1545 _max_yahoo_results = 1000
1546 IE_NAME = u'video.yahoo:search'
1548 def __init__(self, downloader=None):
1549 InfoExtractor.__init__(self, downloader)
1551 def report_download_page(self, query, pagenum):
1552 """Report attempt to download playlist page with given number."""
1553 query = query.decode(preferredencoding())
1554 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (none / 'all' / integer N) and dispatch to _download_n_results.
1556 def _real_extract(self, query):
1557 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the `if mobj is None:` guard is not visible in this listing.
1559 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1562 prefix, query = query.split(':')
1564 query = query.encode('utf-8')
1566 self._download_n_results(query, 1)
1568 elif prefix == 'all':
1569 self._download_n_results(query, self._max_yahoo_results)
# NOTE(review): `n = int(prefix)` / `if n <= 0:` lines are not visible here.
1575 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1577 elif n > self._max_yahoo_results:
1578 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1579 n = self._max_yahoo_results
1580 self._download_n_results(query, n)
1582 except ValueError: # parsing prefix as integer fails
1583 self._download_n_results(query, 1)
1586 def _download_n_results(self, query, n):
1587 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids across result pages.
1590 already_seen = set()
1594 self.report_download_page(query, pagenum)
1595 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1596 request = compat_urllib_request.Request(result_url)
1598 page = compat_urllib_request.urlopen(request).read()
1599 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1600 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1603 # Extract video identifiers
1604 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1605 video_id = mobj.group(1)
1606 if video_id not in already_seen:
1607 video_ids.append(video_id)
1608 already_seen.add(video_id)
1609 if len(video_ids) == n:
1610 # Specified n videos reached
1611 for id in video_ids:
1612 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No more pages: flush whatever ids we have collected so far.
1615 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1616 for id in video_ids:
1617 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1620 pagenum = pagenum + 1
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, loop setup, `return`/`break` statements). Comments describe
# only the visible code; verify against the full file.
1623 class YoutubePlaylistIE(InfoExtractor):
1624 """Information Extractor for YouTube playlists."""
# Group 1: playlist-type marker (p/a/list), group 2: playlist id,
# group 3: optional trailing video id (handled as a single video below).
1626 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1627 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1628 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
# The "Next »" pager label signals that more playlist pages exist.
1629 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1630 IE_NAME = u'youtube:playlist'
1632 def __init__(self, downloader=None):
1633 InfoExtractor.__init__(self, downloader)
1635 def report_download_page(self, playlist_id, pagenum):
1636 """Report attempt to download playlist page with given number."""
1637 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1639 def _real_extract(self, url):
1640 # Extract playlist id
1641 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is not visible in this listing.
1643 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A trailing video id means the URL points at one video inside the playlist.
1647 if mobj.group(3) is not None:
1648 self._downloader.download([mobj.group(3)])
1651 # Download playlist pages
1652 # prefix is 'p' as default for playlists but there are other types that need extra care
1653 playlist_prefix = mobj.group(1)
1654 if playlist_prefix == 'a':
1655 playlist_access = 'artist'
# NOTE(review): the `else:` introducing this default branch is not visible here.
1657 playlist_prefix = 'p'
1658 playlist_access = 'view_play_list'
1659 playlist_id = mobj.group(2)
1664 self.report_download_page(playlist_id, pagenum)
1665 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1666 request = compat_urllib_request.Request(url)
1668 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1669 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1670 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1673 # Extract video identifiers
1675 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1676 if mobj.group(1) not in ids_in_page:
1677 ids_in_page.append(mobj.group(1))
1678 video_ids.extend(ids_in_page)
# Stop paging when the "Next »" control disappears.
1680 if self._MORE_PAGES_INDICATOR not in page:
1682 pagenum = pagenum + 1
1684 total = len(video_ids)
# Apply --playlist-start / --playlist-end slicing (1-based options).
1686 playliststart = self._downloader.params.get('playliststart', 1) - 1
1687 playlistend = self._downloader.params.get('playlistend', -1)
1688 if playlistend == -1:
1689 video_ids = video_ids[playliststart:]
1691 video_ids = video_ids[playliststart:playlistend]
1693 if len(video_ids) == total:
1694 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1696 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1698 for id in video_ids:
1699 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, loop setup, `break`/`return` statements). Comments describe
# only the visible code; verify against the full file.
1703 class YoutubeChannelIE(InfoExtractor):
1704 """Information Extractor for YouTube channels."""
1706 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
# List view sorted by date-added; %s slots are channel id and page number.
1707 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# The "Next »" pager label signals that more channel pages exist.
1708 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1709 IE_NAME = u'youtube:channel'
1711 def report_download_page(self, channel_id, pagenum):
1712 """Report attempt to download channel page with given number."""
1713 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1715 def _real_extract(self, url):
1716 # Extract channel id
1717 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is not visible in this listing.
1719 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1722 # Download channel pages
1723 channel_id = mobj.group(1)
1728 self.report_download_page(channel_id, pagenum)
1729 url = self._TEMPLATE_URL % (channel_id, pagenum)
1730 request = compat_urllib_request.Request(url)
1732 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1733 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1734 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1737 # Extract video identifiers
1739 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1740 if mobj.group(1) not in ids_in_page:
1741 ids_in_page.append(mobj.group(1))
1742 video_ids.extend(ids_in_page)
# Stop paging when the "Next »" control disappears.
1744 if self._MORE_PAGES_INDICATOR not in page:
1746 pagenum = pagenum + 1
1748 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1750 for id in video_ids:
1751 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, loop setup, `break`/`return` statements). Comments describe
# only the visible code; verify against the full file.
1755 class YoutubeUserIE(InfoExtractor):
1756 """Information Extractor for YouTube users."""
# Accepts user-page URLs and the shorthand "ytuser:NAME".
1758 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1759 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps each uploads query at 50 entries, hence the explicit paging below.
1760 _GDATA_PAGE_SIZE = 50
1761 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1762 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1763 IE_NAME = u'youtube:user'
1765 def __init__(self, downloader=None):
1766 InfoExtractor.__init__(self, downloader)
1768 def report_download_page(self, username, start_index):
1769 """Report attempt to download user page."""
1770 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1771 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1773 def _real_extract(self, url):
1775 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is not visible in this listing.
1777 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1780 username = mobj.group(1)
1782 # Download video ids using YouTube Data API. Result size per
1783 # query is limited (currently to 50 videos) so we need to query
1784 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1791 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1792 self.report_download_page(username, start_index)
1794 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1797 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1798 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1799 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1802 # Extract video identifiers
1805 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1806 if mobj.group(1) not in ids_in_page:
1807 ids_in_page.append(mobj.group(1))
1809 video_ids.extend(ids_in_page)
1811 # A little optimization - if current page is not
1812 # "full", ie. does not contain PAGE_SIZE video ids then
1813 # we can assume that this page is the last one - there
1814 # are no more ids on further pages - no need to query
1817 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1822 all_ids_count = len(video_ids)
# Apply --playlist-start / --playlist-end slicing (1-based options).
1823 playliststart = self._downloader.params.get('playliststart', 1) - 1
1824 playlistend = self._downloader.params.get('playlistend', -1)
1826 if playlistend == -1:
1827 video_ids = video_ids[playliststart:]
1829 video_ids = video_ids[playliststart:playlistend]
1831 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1832 (username, all_ids_count, len(video_ids)))
1834 for video_id in video_ids:
1835 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): gaps in the embedded numbering indicate elided lines (guards,
# `try:` headers, loop setup, `break`/`return` statements, and the _PAGE_SIZE
# constant referenced below). Comments describe only the visible code; verify
# against the full file.
1838 class BlipTVUserIE(InfoExtractor):
1839 """Information Extractor for blip.tv users."""
# Accepts blip.tv user-page URLs and the shorthand "bliptvuser:NAME".
1841 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1843 IE_NAME = u'blip.tv:user'
1845 def __init__(self, downloader=None):
1846 InfoExtractor.__init__(self, downloader)
1848 def report_download_page(self, username, pagenum):
1849 """Report attempt to download user page."""
1850 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1851 (self.IE_NAME, username, pagenum))
1853 def _real_extract(self, url):
1855 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard is not visible in this listing.
1857 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1860 username = mobj.group(1)
# Mobile AJAX endpoint; users_id placeholder is filled in after scraping the page.
1862 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1864 request = compat_urllib_request.Request(url)
1867 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# Resolve the username to blip.tv's internal numeric users_id.
1868 mobj = re.search(r'data-users-id="([^"]+)"', page)
1869 page_base = page_base % mobj.group(1)
1870 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1871 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1875 # Download video ids using BlipTV Ajax calls. Result size per
1876 # query is limited (currently to 12 videos) so we need to query
1877 # page by page until there are no video ids - it means we got
1884 self.report_download_page(username, pagenum)
1886 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1889 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1890 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here while sibling extractors use compat_str(err).
1891 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1894 # Extract video identifiers
1897 for mobj in re.finditer(r'href="/([^"]+)"', page):
1898 if mobj.group(1) not in ids_in_page:
1899 ids_in_page.append(unescapeHTML(mobj.group(1)))
1901 video_ids.extend(ids_in_page)
1903 # A little optimization - if current page is not
1904 # "full", ie. does not contain PAGE_SIZE video ids then
1905 # we can assume that this page is the last one - there
1906 # are no more ids on further pages - no need to query
# NOTE(review): _PAGE_SIZE is not defined anywhere in this listing — presumably
# a class attribute (12, per the comment above); confirm upstream.
1909 if len(ids_in_page) < self._PAGE_SIZE:
1914 all_ids_count = len(video_ids)
# Apply --playlist-start / --playlist-end slicing (1-based options).
1915 playliststart = self._downloader.params.get('playliststart', 1) - 1
1916 playlistend = self._downloader.params.get('playlistend', -1)
1918 if playlistend == -1:
1919 video_ids = video_ids[playliststart:]
1921 video_ids = video_ids[playliststart:playlistend]
1923 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1924 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1926 for video_id in video_ids:
1927 self._downloader.download([u'http://blip.tv/'+video_id])
# DepositFilesIE: resolves a depositfiles.com share link to the real
# "fileshare" download URL by POSTing the 'Free download' form.
# NOTE(review): this listing is a sampled, line-numbered extract; gaps in the
# embedded numbering (e.g. 1951, 1956-1957, 1966, 1968-1969) show that try:/
# return/else lines are elided here, so the code is kept byte-identical and
# only annotated.
1930 class DepositFilesIE(InfoExtractor):
1931 """Information extractor for depositfiles.com"""
1933 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1935 def report_download_webpage(self, file_id):
1936 """Report webpage download."""
1937 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1939 def report_extraction(self, file_id):
1940 """Report information extraction."""
1941 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1943 def _real_extract(self, url):
# The file id is the last path component; the URL is rebuilt so the page is
# always fetched in the English locale regardless of the input URL's locale.
1944 file_id = url.split('/')[-1]
1945 # Rebuild url in english locale
1946 url = 'http://depositfiles.com/en/files/' + file_id
1948 # Retrieve file webpage with 'Free download' button pressed
1949 free_download_indication = { 'gateway_result' : '1' }
1950 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# (elided try: before 1953) network failures fall through to the handler below
1952 self.report_download_webpage(file_id)
1953 webpage = compat_urllib_request.urlopen(request).read()
1954 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1955 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1958 # Search for the real file URL
1959 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1960 if (mobj is None) or (mobj.group(1) is None):
1961 # Try to figure out reason of the error.
1962 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1963 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse the restriction message's whitespace into single spaces for display.
1964 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1965 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1967 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1970 file_url = mobj.group(1)
1971 file_extension = os.path.splitext(file_url)[1][1:]
1973 # Search for file title
1974 mobj = re.search(r'<b title="(.*?)">', webpage)
1976 self._downloader.trouble(u'ERROR: unable to extract title')
1978 file_title = mobj.group(1).decode('utf-8')
# NOTE(review): the .decode('utf-8') calls imply these are Python 2 byte
# strings; under Python 3 they would raise AttributeError — confirm runtime.
1981 'id': file_id.decode('utf-8'),
1982 'url': file_url.decode('utf-8'),
1984 'upload_date': None,
1985 'title': file_title,
1986 'ext': file_extension.decode('utf-8'),
# FacebookIE: logs in (credentials or .netrc) during _real_initialize, then
# parses the swf addVariable JSON blob out of the video page in _real_extract.
# NOTE(review): sampled listing — elided lines include the netrc try:, the
# login_form construction (~2027-2034), several else/return branches and the
# final info-dict opener; code kept byte-identical, comments only.
1990 class FacebookIE(InfoExtractor):
1991 """Information Extractor for Facebook"""
1993 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1994 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1995 _NETRC_MACHINE = 'facebook'
1996 IE_NAME = u'facebook'
1998 def report_login(self):
1999 """Report attempt to log in."""
2000 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2002 def _real_initialize(self):
# Login is best-effort: every failure path below only warns via to_stderr and
# lets extraction proceed anonymously.
2003 if self._downloader is None:
2008 downloader_params = self._downloader.params
2010 # Attempt to use provided username and password or .netrc data
2011 if downloader_params.get('username', None) is not None:
2012 useremail = downloader_params['username']
2013 password = downloader_params['password']
2014 elif downloader_params.get('usenetrc', False):
2016 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2017 if info is not None:
2021 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2022 except (IOError, netrc.NetrcParseError) as err:
2023 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2026 if useremail is None:
2035 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2038 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2039 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2040 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2042 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2043 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2046 def _real_extract(self, url):
2047 mobj = re.match(self._VALID_URL, url)
2049 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2051 video_id = mobj.group('ID')
2053 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2054 webpage = self._download_webpage(url, video_id)
# The swf parameters are a JSON array sandwiched between these two exact
# JavaScript fragments; BEFORE/AFTER are regex-escaped and used as anchors.
2056 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2057 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2058 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2060 raise ExtractorError(u'Cannot parse data')
2061 data = dict(json.loads(m.group(1)))
# 'params' is URL-encoded JSON nested inside the outer JSON blob.
2062 params_raw = compat_urllib_parse.unquote(data['params'])
2063 params = json.loads(params_raw)
2064 video_url = params['hd_src']
2065 video_duration = int(params['video_duration'])
2067 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2069 raise ExtractorError(u'Cannot find title in webpage')
2070 video_title = unescapeHTML(m.group(1))
2074 'title': video_title,
2077 'duration': video_duration,
2078 'thumbnail': params['thumbnail_src'],
# BlipTVIE: queries blip.tv's JSON API (skin=json) for metadata; if the URL
# already serves video/* content it short-circuits into a direct download.
# NOTE(review): sampled listing — the cchar computation (~2100-2107), the
# try:, the direct-download info dict body (~2121-2129) and the return are
# elided; code kept byte-identical, comments only.
2084 class BlipTVIE(InfoExtractor):
2085 """Information extractor for blip.tv"""
2086 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2087 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2088 IE_NAME = u'blip.tv'
2090 def report_extraction(self, file_id):
2091 """Report information extraction."""
2092 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2094 def report_direct_download(self, title):
2095 """Report information extraction."""
2096 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2098 def _real_extract(self, url):
2099 mobj = re.match(self._VALID_URL, url)
2101 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&', computed in an elided line) joins the JSON query string.
2108 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2109 request = compat_urllib_request.Request(json_url)
# The iTunes user agent is required for the API to answer; it is also passed
# through to the downloader via 'user_agent' below.
2110 request.add_header('User-Agent', 'iTunes/10.6.1')
2111 self.report_extraction(mobj.group(1))
2114 urlh = compat_urllib_request.urlopen(request)
2115 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2116 basename = url.split('/')[-1]
2117 title,ext = os.path.splitext(basename)
2118 title = title.decode('UTF-8')
2119 ext = ext.replace('.', '')
2120 self.report_direct_download(title)
2125 'upload_date': None,
2130 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2131 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2132 if info is None: # Regular URL
2134 json_code_bytes = urlh.read()
2135 json_code = json_code_bytes.decode('utf-8')
2136 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2137 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2141 json_data = json.loads(json_code)
# The API may wrap the payload in a 'Post' envelope or return it bare.
2142 if 'Post' in json_data:
2143 data = json_data['Post']
# datestamp format example: '11-06-12 21:30PM' -> normalized to YYYYMMDD.
2147 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2148 video_url = data['media']['url']
2149 umobj = re.match(self._URL_EXT, video_url)
2151 raise ValueError('Can not determine filename extension')
2152 ext = umobj.group(1)
2155 'id': data['item_id'],
2157 'uploader': data['display_name'],
2158 'upload_date': upload_date,
2159 'title': data['title'],
2161 'format': data['media']['mimeType'],
2162 'thumbnail': data['thumbnailUrl'],
2163 'description': data['description'],
2164 'player_url': data['embedUrl'],
2165 'user_agent': 'iTunes/10.6.1',
2167 except (ValueError,KeyError) as err:
2168 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Resolves a myvideo.de watch page to a direct .flv URL: the media-server
    base URL is taken from the page's thumbnail <link> tag and the flv lives
    under the same movie hash.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a single-element list of info dicts, or None after an error.

        Errors are reported through self._downloader.trouble(), matching the
        convention of the other extractors in this file.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble(...) — the attribute is named
            # _downloader everywhere else in this class, so the invalid-URL
            # path raised AttributeError instead of reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media-server base URL; the flv is
        # served from the same movie-hash directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,       # uploader is not exposed on the watch page
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',     # myvideo.de always serves flv here
        }]
# ComedyCentralIE: resolves Daily Show / Colbert Report URLs (including the
# :tds / :colbert shortcuts) to an MRSS index, then a mediagen config per
# episode part, then rewrites the RTMP URL into a plain HTTP one.
# NOTE(review): sampled listing — the _video_extensions/_video_dimensions
# bodies, several try:/else/return lines, the turls accumulation and the
# per-part info-dict opener are elided; code kept byte-identical.
2223 class ComedyCentralIE(InfoExtractor):
2224 """Information extractor for The Daily Show and Colbert Report """
2226 # urls can be abbreviations like :thedailyshow or :colbert
2227 # urls for episodes like:
2228 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2229 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2230 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2231 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2232 |(https?://)?(www\.)?
2233 (?P<showname>thedailyshow|colbertnation)\.com/
2234 (full-episodes/(?P<episode>.*)|
2236 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2237 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2240 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2242 _video_extensions = {
2250 _video_dimensions = {
# suitable() is overridden because _VALID_URL is a verbose-mode regex and the
# base class matches without re.VERBOSE.
2259 def suitable(self, url):
2260 """Receives a URL and returns True if suitable for this IE."""
2261 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2263 def report_extraction(self, episode_id):
2264 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2266 def report_config_download(self, episode_id, media_id):
2267 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2269 def report_index_download(self, episode_id):
2270 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2272 def _print_formats(self, formats):
2273 print('Available formats:')
2275 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2278 def _real_extract(self, url):
2279 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2281 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortcut URLs (:tds, :colbert, ...) are rewritten to the newest-full-episode
# page and then re-matched against _VALID_URL.
2284 if mobj.group('shortname'):
2285 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2286 url = u'http://www.thedailyshow.com/full-episodes/'
2288 url = u'http://www.colbertnation.com/full-episodes/'
2289 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2290 assert mobj is not None
2292 if mobj.group('clip'):
2293 if mobj.group('showname') == 'thedailyshow':
2294 epTitle = mobj.group('tdstitle')
2296 epTitle = mobj.group('cntitle')
2299 dlNewest = not mobj.group('episode')
2301 epTitle = mobj.group('showname')
2303 epTitle = mobj.group('episode')
2305 req = compat_urllib_request.Request(url)
2306 self.report_extraction(epTitle)
2308 htmlHandle = compat_urllib_request.urlopen(req)
2309 html = htmlHandle.read()
2310 webpage = html.decode('utf-8')
2311 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2312 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The "newest" page redirects; re-derive the episode from the final URL.
2315 url = htmlHandle.geturl()
2316 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2318 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2320 if mobj.group('episode') == '':
2321 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2323 epTitle = mobj.group('episode')
2325 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2327 if len(mMovieParams) == 0:
2328 # The Colbert Report embeds the information in a without
2329 # a URL prefix; so extract the alternate reference
2330 # and then add the URL prefix manually.
2332 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2333 if len(altMovieParams) == 0:
2334 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2337 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2339 uri = mMovieParams[0][1]
2340 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2341 self.report_index_download(epTitle)
2343 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2344 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2345 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One MRSS <item> per episode part; each part gets its own mediagen config.
2350 idoc = xml.etree.ElementTree.fromstring(indexXml)
2351 itemEls = idoc.findall('.//item')
2352 for partNum,itemEl in enumerate(itemEls):
2353 mediaId = itemEl.findall('./guid')[0].text
2354 shortMediaId = mediaId.split(':')[-1]
2355 showId = mediaId.split(':')[-2].replace('.com', '')
2356 officialTitle = itemEl.findall('./title')[0].text
2357 officialDate = itemEl.findall('./pubDate')[0].text
2359 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2360 compat_urllib_parse.urlencode({'uri': mediaId}))
2361 configReq = compat_urllib_request.Request(configUrl)
2362 self.report_config_download(epTitle, shortMediaId)
2364 configXml = compat_urllib_request.urlopen(configReq).read()
2365 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2366 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2369 cdoc = xml.etree.ElementTree.fromstring(configXml)
# turls (built in an elided line) collects (bitrate, rtmp-url) pairs.
2371 for rendition in cdoc.findall('.//rendition'):
2372 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2376 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2379 if self._downloader.params.get('listformats', None):
2380 self._print_formats([i[0] for i in turls])
2383 # For now, just pick the highest bitrate
2384 format,rtmp_video_url = turls[-1]
2386 # Get the format arg from the arg stream
2387 req_format = self._downloader.params.get('format', None)
2389 # Select format if we can find one
2392 format, rtmp_video_url = f, v
# The RTMP path's gsp.comedystor suffix maps 1:1 onto an HTTP mirror path.
2395 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2397 raise ExtractorError(u'Cannot transform RTMP url')
2398 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2399 video_url = base + m.group('finalid')
2401 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2406 'upload_date': officialDate,
2411 'description': officialTitle,
2413 results.append(info)
# EscapistIE: reads the og:video player URL from the page meta tags, follows
# its config= query parameter to a JS-flavored JSON config, and takes the
# second playlist entry as the video URL.
# NOTE(review): sampled listing — try:/return lines and the final info-dict
# opener are elided; code kept byte-identical, comments only.
2418 class EscapistIE(InfoExtractor):
2419 """Information extractor for The Escapist """
2421 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2422 IE_NAME = u'escapist'
2424 def report_extraction(self, showName):
2425 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2427 def report_config_download(self, showName):
2428 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2430 def _real_extract(self, url):
2431 mobj = re.match(self._VALID_URL, url)
2433 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2435 showName = mobj.group('showname')
2436 videoId = mobj.group('episode')
2438 self.report_extraction(showName)
2440 webPage = compat_urllib_request.urlopen(url)
2441 webPageBytes = webPage.read()
# Decode using the charset advertised in the Content-Type header, falling
# back to UTF-8 when none is present.
2442 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2443 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2444 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2445 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2448 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2449 description = unescapeHTML(descMatch.group(1))
2450 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2451 imgUrl = unescapeHTML(imgMatch.group(1))
2452 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2453 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL rides along percent-encoded in the player URL's query string.
2454 configUrlMatch = re.search('config=(.*)$', playerUrl)
2455 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2457 self.report_config_download(showName)
2459 configJSON = compat_urllib_request.urlopen(configUrl)
2460 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2461 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2462 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2463 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2466 # Technically, it's JavaScript, not JSON
2467 configJSON = configJSON.replace("'", '"')
2470 config = json.loads(configJSON)
2471 except (ValueError,) as err:
2472 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2475 playlist = config['playlist']
# playlist[1] (not [0]) is the actual video entry in this player's config.
2476 videoUrl = playlist[1]['url']
2481 'uploader': showName,
2482 'upload_date': None,
2485 'thumbnail': imgUrl,
2486 'description': description,
2487 'player_url': playerUrl,
# CollegeHumorIE: fetches the moogaloop metadata XML, then the Adobe HDS f4m
# manifest it points to, and synthesizes a Seg1-Frag1 fragment URL from the
# manifest's media/id nodes.
# NOTE(review): sampled listing — try:/return lines, the info-dict opener and
# the trailing 'ext' assignment (~2558+) are elided; code kept byte-identical.
2493 class CollegeHumorIE(InfoExtractor):
2494 """Information extractor for collegehumor.com"""
2496 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2497 IE_NAME = u'collegehumor'
2499 def report_manifest(self, video_id):
2500 """Report information extraction."""
2501 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2503 def report_extraction(self, video_id):
2504 """Report information extraction."""
2505 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2507 def _real_extract(self, url):
2508 mobj = re.match(self._VALID_URL, url)
2510 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2512 video_id = mobj.group('videoid')
2517 'upload_date': None,
2520 self.report_extraction(video_id)
2521 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2523 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2524 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2525 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2528 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# IndexError from the [0] lookups is caught by an elided except and reported
# as invalid metadata below.
2530 videoNode = mdoc.findall('./video')[0]
2531 info['description'] = videoNode.findall('./description')[0].text
2532 info['title'] = videoNode.findall('./caption')[0].text
2533 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2534 manifest_url = videoNode.findall('./file')[0].text
2536 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore is the HDS player version tag the manifest server expects.
2539 manifest_url += '?hdcore=2.10.3'
2540 self.report_manifest(video_id)
2542 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2543 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2544 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2547 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe f4m XML namespace.
2549 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2550 node_id = media_node.attrib['url']
2551 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2552 except IndexError as err:
2553 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2556 url_pr = compat_urllib_parse_urlparse(manifest_url)
2557 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE: scrapes flv_url, <title> and the thumbnail URL straight out of
# the watch-page HTML.
# NOTE(review): sampled listing — the 'if mobj is None:' guards before each
# trouble() call and the final info-dict/return are elided; code kept
# byte-identical, comments only.
2564 class XVideosIE(InfoExtractor):
2565 """Information extractor for xvideos.com"""
2567 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2568 IE_NAME = u'xvideos'
2570 def report_extraction(self, video_id):
2571 """Report information extraction."""
2572 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2574 def _real_extract(self, url):
2575 mobj = re.match(self._VALID_URL, url)
2577 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2579 video_id = mobj.group(1)
2581 webpage = self._download_webpage(url, video_id)
2583 self.report_extraction(video_id)
# The direct flv URL is percent-encoded inside the player's query string.
2587 mobj = re.search(r'flv_url=(.+?)&', webpage)
2589 self._downloader.trouble(u'ERROR: unable to extract video url')
2591 video_url = compat_urllib_parse.unquote(mobj.group(1))
2595 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2597 self._downloader.trouble(u'ERROR: unable to extract video title')
2599 video_title = mobj.group(1)
2602 # Extract video thumbnail
2603 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2605 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the captured filename.
2607 video_thumbnail = mobj.group(0)
2613 'upload_date': None,
2614 'title': video_title,
2616 'thumbnail': video_thumbnail,
2617 'description': None,
# SoundcloudIE: resolves the page URL through api.soundcloud.com/resolve.json
# to get the track id, then asks the streams endpoint for the mp3 URL.
# NOTE(review): sampled listing — try:/return lines and the info-dict opener
# are elided; code kept byte-identical, comments only.
2623 class SoundcloudIE(InfoExtractor):
2624 """Information extractor for soundcloud.com
2625 To access the media, the uid of the song and a stream token
2626 must be extracted from the page source and the script must make
2627 a request to media.soundcloud.com/crossdomain.xml. Then
2628 the media can be grabbed by requesting from an url composed
2629 of the stream token and uid
2632 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2633 IE_NAME = u'soundcloud'
2635 def __init__(self, downloader=None):
2636 InfoExtractor.__init__(self, downloader)
2638 def report_resolve(self, video_id):
2639 """Report information extraction."""
2640 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2642 def report_extraction(self, video_id):
2643 """Report information extraction."""
2644 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2646 def _real_extract(self, url):
2647 mobj = re.match(self._VALID_URL, url)
2649 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2652 # extract uploader (which is in the url)
2653 uploader = mobj.group(1)
2654 # extract simple title (uploader + slug of song title)
2655 slug_title = mobj.group(2)
2656 simple_title = uploader + u'-' + slug_title
2658 self.report_resolve('%s/%s' % (uploader, slug_title))
2660 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
# The client_id below is the application key this extractor registered with
# the SoundCloud API; the same key is reused for the streams request.
2661 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2662 request = compat_urllib_request.Request(resolv_url)
2664 info_json_bytes = compat_urllib_request.urlopen(request).read()
2665 info_json = info_json_bytes.decode('utf-8')
2666 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2667 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2670 info = json.loads(info_json)
2671 video_id = info['id']
2672 self.report_extraction('%s/%s' % (uploader, slug_title))
2674 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2675 request = compat_urllib_request.Request(streams_url)
2677 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2678 stream_json = stream_json_bytes.decode('utf-8')
2679 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2680 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2683 streams = json.loads(stream_json)
2684 mediaURL = streams['http_mp3_128_url']
2689 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is passed through verbatim, not normalized to
# the YYYYMMDD upload_date convention documented on InfoExtractor — verify.
2690 'upload_date': info['created_at'],
2691 'title': info['title'],
2693 'description': info['description'],
# InfoQIE: decodes the base64 jsclassref attribute into the real media path
# and builds an rtmpe URL from it; title/description come from the page HTML.
# NOTE(review): sampled listing — 'if mobj is None:' guards, returns and the
# info-dict opener are elided; code kept byte-identical, comments only.
2698 class InfoQIE(InfoExtractor):
2699 """Information extractor for infoq.com"""
2699 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2701 def report_extraction(self, video_id):
2702 """Report information extraction."""
2703 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2705 def _real_extract(self, url):
2706 mobj = re.match(self._VALID_URL, url)
2708 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2711 webpage = self._download_webpage(url, video_id=url)
2712 self.report_extraction(url)
2715 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2717 self._downloader.trouble(u'ERROR: unable to extract video url')
# jsclassref is base64 of a percent-encoded media path.
2719 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2720 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2723 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2725 self._downloader.trouble(u'ERROR: unable to extract video title')
2727 video_title = mobj.group(1)
2729 # Extract description
2730 video_description = u'No description available.'
2731 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2732 if mobj is not None:
2733 video_description = mobj.group(1)
# The video id and extension are recovered from the media file name itself.
2735 video_filename = video_url.split('/')[-1]
2736 video_id, extension = video_filename.split('.')
2742 'upload_date': None,
2743 'title': video_title,
2744 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2746 'description': video_description,
# MixcloudIE: fetches the cloudcast JSON, picks a format/bitrate from its
# 'audio_formats' section and probes candidate URLs until one responds.
# Already disabled via _WORKING = False (site moved to a new API).
# NOTE(review): sampled listing — try:/else/return lines and the url_list
# return in get_urls are elided; code kept byte-identical, comments only.
2752 class MixcloudIE(InfoExtractor):
2753 """Information extractor for www.mixcloud.com"""
2754 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2755 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2756 IE_NAME = u'mixcloud'
2758 def __init__(self, downloader=None):
2759 InfoExtractor.__init__(self, downloader)
2761 def report_download_json(self, file_id):
2762 """Report JSON download."""
2763 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2765 def report_extraction(self, file_id):
2766 """Report information extraction."""
2767 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2769 def get_urls(self, jsonData, fmt, bitrate='best'):
2770 """Get urls from 'audio_formats' section in json"""
2773 bitrate_list = jsonData[fmt]
2774 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# Bitrates are string keys; max() picks the lexicographically highest, which
# presumably matches the numerically highest here — verify against the API.
2775 bitrate = max(bitrate_list) # select highest
2777 url_list = jsonData[fmt][bitrate]
2778 except TypeError: # we have no bitrate info.
2779 url_list = jsonData[fmt]
2782 def check_urls(self, url_list):
2783 """Returns 1st active url from list"""
2784 for url in url_list:
2786 compat_urllib_request.urlopen(url)
2788 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2793 def _print_formats(self, formats):
2794 print('Available formats:')
2795 for fmt in formats.keys():
2796 for b in formats[fmt]:
2798 ext = formats[fmt][b][0]
2799 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2800 except TypeError: # we have no bitrate info
2801 ext = formats[fmt][0]
2802 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2805 def _real_extract(self, url):
2806 mobj = re.match(self._VALID_URL, url)
2808 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2810 # extract uploader & filename from url
# NOTE(review): the .decode('utf-8') calls here imply Python 2 byte strings;
# under Python 3 they would raise AttributeError — confirm target runtime.
2811 uploader = mobj.group(1).decode('utf-8')
2812 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2814 # construct API request
2815 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2816 # retrieve .json file with links to files
2817 request = compat_urllib_request.Request(file_url)
2819 self.report_download_json(file_url)
2820 jsonData = compat_urllib_request.urlopen(request).read()
2821 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2822 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2826 json_data = json.loads(jsonData)
2827 player_url = json_data['player_swf_url']
2828 formats = dict(json_data['audio_formats'])
2830 req_format = self._downloader.params.get('format', None)
2833 if self._downloader.params.get('listformats', None):
2834 self._print_formats(formats)
# 'best': iterate all formats until a live URL is found; otherwise only the
# requested format is probed.
2837 if req_format is None or req_format == 'best':
2838 for format_param in formats.keys():
2839 url_list = self.get_urls(formats, format_param)
2841 file_url = self.check_urls(url_list)
2842 if file_url is not None:
2845 if req_format not in formats:
2846 self._downloader.trouble(u'ERROR: format is not available')
2849 url_list = self.get_urls(formats, req_format)
2850 file_url = self.check_urls(url_list)
2851 format_param = req_format
2854 'id': file_id.decode('utf-8'),
2855 'url': file_url.decode('utf-8'),
2856 'uploader': uploader.decode('utf-8'),
2857 'upload_date': None,
2858 'title': json_data['name'],
2859 'ext': file_url.split('.')[-1].decode('utf-8'),
2860 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2861 'thumbnail': json_data['thumbnail_url'],
2862 'description': json_data['description'],
2863 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: three-way dispatch on the URL — a specific video
# (course+video), a course page (links re-dispatched through self.extract),
# or the site root (all course pages re-dispatched).
# NOTE(review): sampled listing — try:/except lines, dict openers, list
# comprehension scaffolding around 'type': 'reference' entries and the final
# returns are elided; code kept byte-identical, comments only.
2867 class StanfordOpenClassroomIE(InfoExtractor):
2868 """Information extractor for Stanford's Open ClassRoom"""
2869 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2870 IE_NAME = u'stanfordoc'
2872 def report_download_webpage(self, objid):
2873 """Report information extraction."""
2874 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2876 def report_extraction(self, video_id):
2877 """Report information extraction."""
2878 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2880 def _real_extract(self, url):
2881 mobj = re.match(self._VALID_URL, url)
2883 raise ExtractorError(u'Invalid URL: %s' % url)
2885 if mobj.group('course') and mobj.group('video'): # A specific video
2886 course = mobj.group('course')
2887 video = mobj.group('video')
2889 'id': course + '_' + video,
2891 'upload_date': None,
2894 self.report_extraction(info['id'])
2895 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2896 xmlUrl = baseUrl + video + '.xml'
2898 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2899 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2900 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2902 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2904 info['title'] = mdoc.findall('./title')[0].text
2905 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2907 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2909 info['ext'] = info['url'].rpartition('.')[2]
2911 elif mobj.group('course'): # A course page
2912 course = mobj.group('course')
2917 'upload_date': None,
2920 coursepage = self._download_webpage(url, info['id'],
2921 note='Downloading course info page',
2922 errnote='Unable to download course info page')
2924 m = re.search('<h1>([^<]+)</h1>', coursepage)
2926 info['title'] = unescapeHTML(m.group(1))
2928 info['title'] = info['id']
2930 m = re.search('<description>([^<]+)</description>', coursepage)
2932 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link becomes a 'reference' entry, then is recursively
# extracted through self.extract below.
2934 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2937 'type': 'reference',
2938 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2942 for entry in info['list']:
2943 assert entry['type'] == 'reference'
2944 results += self.extract(entry['url'])
2948 'id': 'Stanford OpenClassroom',
2951 'upload_date': None,
2954 self.report_download_webpage(info['id'])
2955 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2957 rootpage = compat_urllib_request.urlopen(rootURL).read()
2958 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2959 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2962 info['title'] = info['id']
2964 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2967 'type': 'reference',
2968 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2973 for entry in info['list']:
2974 assert entry['type'] == 'reference'
2975 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # Scheme-less URLs are accepted by the regexp; normalize them.
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage already returns a decoded unicode string,
        # so no extra .decode('iso-8859-1') is applied here (it would raise
        # AttributeError on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'youku'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-unique session id: ms timestamp + two random parts."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the alphabet with Youku's seeded PRNG.

        Returns the shuffled character list used to decode file ids.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step matching Youku's player code.
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated fileId using the seeded mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Column 8,9 of fileid represent the segment number, so
        # fileid[7:9] must be replaced per segment.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report extracted entry date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report extracted uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report extracted title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date; optional, so default to None.
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
            self.report_date(upload_date)

        # Extract uploader; optional, so default to None.
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
            self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
            self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """Return the first (unescaped) group of rexp in webpage, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key name: was the typo 'uploader_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives must be fetched page by page.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            return  # bail out instead of crashing on m.group below
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return  # bail out instead of crashing on m.group below
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        # Description is optional: warn but continue without it.
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The video file lives at a fixed path derived from the status id.
        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Pair each movie entry with its title, in page order.
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The flv can be fetched directly from the CDN by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob in an inline script.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats matching req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path component 4 encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Fixed: previously tested the stale regexp match object
            # ('result') instead of the looked-up format.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed message: this failure is about the upload date, not the title.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page; the real stream URL only appears there.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JSON argument to TRAX.Mix().
        m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # A random session id is required by the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3912 def gen_extractors():
3913 """ Return a list of an instance of every supported extractor.
3914 The order does matter; the first extractor matched is the one handling the URL.
3917 YoutubePlaylistIE(),
3941 StanfordOpenClassroomIE(),