2 # -*- coding: utf-8 -*-
from __future__ import absolute_import

import datetime
import netrc
import re
import socket
import sys
import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False        # becomes True once _real_initialize() has run
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # set to False in subclasses for broken extractors

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name, e.g. YoutubeIE -> 'Youtube'
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "heightxwidth" display string (used by --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
191 def suitable(self, url):
192 """Receives a URL and returns True if suitable for this IE."""
193 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
195 def report_lang(self):
196 """Report attempt to set language."""
197 self._downloader.to_screen(u'[youtube] Setting language')
199 def report_login(self):
200 """Report attempt to log in."""
201 self._downloader.to_screen(u'[youtube] Logging in')
203 def report_age_confirmation(self):
204 """Report attempt to confirm age."""
205 self._downloader.to_screen(u'[youtube] Confirming age')
207 def report_video_webpage_download(self, video_id):
208 """Report attempt to download video webpage."""
209 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
211 def report_video_info_webpage_download(self, video_id):
212 """Report attempt to download video info webpage."""
213 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
215 def report_video_subtitles_download(self, video_id):
216 """Report attempt to download video info webpage."""
217 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
219 def report_information_extraction(self, video_id):
220 """Report attempt to extract video information."""
221 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
223 def report_unavailable_format(self, video_id, format):
224 """Report extracted video URL."""
225 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
227 def report_rtmp_download(self):
228 """Indicate the download will use the RTMP protocol."""
229 self._downloader.to_screen(u'[youtube] RTMP download detected')
231 def _closed_captions_xml_to_srt(self, xml_string):
233 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
234 # TODO parse xml instead of regex
235 for n, (start, dur_tag, dur, caption) in enumerate(texts):
236 if not dur: dur = '4'
238 end = start + float(dur)
239 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
240 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
241 caption = unescapeHTML(caption)
242 caption = unescapeHTML(caption) # double cycle, intentional
243 srt += str(n+1) + '\n'
244 srt += start + ' --> ' + end + '\n'
245 srt += caption + '\n\n'
248 def _extract_subtitles(self, video_id):
249 self.report_video_subtitles_download(video_id)
250 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
252 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
253 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
254 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
255 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
256 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
257 if not srt_lang_list:
258 return (u'WARNING: video has no closed captions', None)
259 if self._downloader.params.get('subtitleslang', False):
260 srt_lang = self._downloader.params.get('subtitleslang')
261 elif 'en' in srt_lang_list:
264 srt_lang = list(srt_lang_list.keys())[0]
265 if not srt_lang in srt_lang_list:
266 return (u'WARNING: no closed captions found in the specified language', None)
267 params = compat_urllib_parse.urlencode({
269 'name': srt_lang_list[srt_lang].encode('utf-8'),
272 url = 'http://www.youtube.com/api/timedtext?' + params
274 srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
275 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
276 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
278 return (u'WARNING: Did not fetch video subtitles', None)
279 return (None, self._closed_captions_xml_to_srt(srt_xml))
281 def _print_formats(self, formats):
282 print('Available formats:')
284 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
286 def _real_initialize(self):
287 if self._downloader is None:
292 downloader_params = self._downloader.params
294 # Attempt to use provided username and password or .netrc data
295 if downloader_params.get('username', None) is not None:
296 username = downloader_params['username']
297 password = downloader_params['password']
298 elif downloader_params.get('usenetrc', False):
300 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
305 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
306 except (IOError, netrc.NetrcParseError) as err:
307 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
311 request = compat_urllib_request.Request(self._LANG_URL)
314 compat_urllib_request.urlopen(request).read()
315 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
316 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
319 # No authentication to be performed
325 'current_form': 'loginForm',
327 'action_login': 'Log In',
328 'username': username,
329 'password': password,
331 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
334 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
335 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
336 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
338 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
339 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
345 'action_confirm': 'Confirm',
347 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
349 self.report_age_confirmation()
350 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
351 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
352 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
355 def _extract_id(self, url):
356 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
358 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
360 video_id = mobj.group(2)
363 def _real_extract(self, url):
364 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
365 mobj = re.search(self._NEXT_URL_RE, url)
367 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
368 video_id = self._extract_id(url)
371 self.report_video_webpage_download(video_id)
372 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
373 request = compat_urllib_request.Request(url)
375 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
376 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
377 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
380 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
382 # Attempt to extract SWF player URL
383 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
385 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
390 self.report_video_info_webpage_download(video_id)
391 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
392 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
393 % (video_id, el_type))
394 request = compat_urllib_request.Request(video_info_url)
396 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
397 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
398 video_info = compat_parse_qs(video_info_webpage)
399 if 'token' in video_info:
401 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
402 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
404 if 'token' not in video_info:
405 if 'reason' in video_info:
406 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
408 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
411 # Check for "rental" videos
412 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
413 self._downloader.trouble(u'ERROR: "rental" videos not supported')
416 # Start extracting information
417 self.report_information_extraction(video_id)
420 if 'author' not in video_info:
421 self._downloader.trouble(u'ERROR: unable to extract uploader name')
423 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
426 video_uploader_id = None
427 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
429 video_uploader_id = mobj.group(1)
431 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
434 if 'title' not in video_info:
435 self._downloader.trouble(u'ERROR: unable to extract video title')
437 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
440 if 'thumbnail_url' not in video_info:
441 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
443 else: # don't panic if we can't find it
444 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
448 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
450 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
451 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
452 for expression in format_expressions:
454 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
459 video_description = get_element_by_id("eow-description", video_webpage)
460 if video_description:
461 video_description = clean_html(video_description)
463 video_description = ''
466 video_subtitles = None
467 if self._downloader.params.get('writesubtitles', False):
468 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
470 self._downloader.trouble(srt_error)
472 if 'length_seconds' not in video_info:
473 self._downloader.trouble(u'WARNING: unable to extract video duration')
476 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
479 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
481 # Decide which formats to download
482 req_format = self._downloader.params.get('format', None)
484 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
485 self.report_rtmp_download()
486 video_url_list = [(None, video_info['conn'][0])]
487 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
488 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
489 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
490 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
491 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
493 format_limit = self._downloader.params.get('format_limit', None)
494 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
495 if format_limit is not None and format_limit in available_formats:
496 format_list = available_formats[available_formats.index(format_limit):]
498 format_list = available_formats
499 existing_formats = [x for x in format_list if x in url_map]
500 if len(existing_formats) == 0:
501 self._downloader.trouble(u'ERROR: no known formats available for video')
503 if self._downloader.params.get('listformats', None):
504 self._print_formats(existing_formats)
506 if req_format is None or req_format == 'best':
507 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
508 elif req_format == 'worst':
509 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
510 elif req_format in ('-1', 'all'):
511 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
513 # Specific formats. We pick the first in a slash-delimeted sequence.
514 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
515 req_formats = req_format.split('/')
516 video_url_list = None
517 for rf in req_formats:
519 video_url_list = [(rf, url_map[rf])]
521 if video_url_list is None:
522 self._downloader.trouble(u'ERROR: requested format not available')
525 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
529 for format_param, video_real_url in video_url_list:
531 video_extension = self._video_extensions.get(format_param, 'flv')
533 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
534 self._video_dimensions.get(format_param, '???'))
538 'url': video_real_url,
539 'uploader': video_uploader,
540 'uploader_id': video_uploader_id,
541 'upload_date': upload_date,
542 'title': video_title,
543 'ext': video_extension,
544 'format': video_format,
545 'thumbnail': video_thumbnail,
546 'description': video_description,
547 'player_url': player_url,
548 'subtitles': video_subtitles,
549 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
562 def __init__(self, downloader=None):
563 InfoExtractor.__init__(self, downloader)
565 def report_disclaimer(self):
566 """Report disclaimer retrieval."""
567 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
569 def report_age_confirmation(self):
570 """Report attempt to confirm age."""
571 self._downloader.to_screen(u'[metacafe] Confirming age')
573 def report_download_webpage(self, video_id):
574 """Report webpage download."""
575 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
577 def report_extraction(self, video_id):
578 """Report information extraction."""
579 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
581 def _real_initialize(self):
582 # Retrieve disclaimer
583 request = compat_urllib_request.Request(self._DISCLAIMER)
585 self.report_disclaimer()
586 disclaimer = compat_urllib_request.urlopen(request).read()
587 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
588 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
594 'submit': "Continue - I'm over 18",
596 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
598 self.report_age_confirmation()
599 disclaimer = compat_urllib_request.urlopen(request).read()
600 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
601 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
604 def _real_extract(self, url):
605 # Extract id and simplified title from URL
606 mobj = re.match(self._VALID_URL, url)
608 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
611 video_id = mobj.group(1)
613 # Check if video comes from YouTube
614 mobj2 = re.match(r'^yt-(.*)$', video_id)
615 if mobj2 is not None:
616 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
619 # Retrieve video webpage to extract further information
620 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
622 self.report_download_webpage(video_id)
623 webpage = compat_urllib_request.urlopen(request).read()
624 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
625 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
628 # Extract URL, uploader and title from webpage
629 self.report_extraction(video_id)
630 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
632 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
633 video_extension = mediaURL[-3:]
635 # Extract gdaKey if available
636 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
640 gdaKey = mobj.group(1)
641 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
643 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
645 self._downloader.trouble(u'ERROR: unable to extract media URL')
647 vardict = compat_parse_qs(mobj.group(1))
648 if 'mediaData' not in vardict:
649 self._downloader.trouble(u'ERROR: unable to extract media URL')
651 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
653 self._downloader.trouble(u'ERROR: unable to extract media URL')
655 mediaURL = mobj.group(1).replace('\\/', '/')
656 video_extension = mediaURL[-3:]
657 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
659 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
661 self._downloader.trouble(u'ERROR: unable to extract title')
663 video_title = mobj.group(1).decode('utf-8')
665 mobj = re.search(r'submitter=(.*?);', webpage)
667 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
669 video_uploader = mobj.group(1)
672 'id': video_id.decode('utf-8'),
673 'url': video_url.decode('utf-8'),
674 'uploader': video_uploader.decode('utf-8'),
676 'title': video_title,
677 'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
687 def __init__(self, downloader=None):
688 InfoExtractor.__init__(self, downloader)
690 def report_extraction(self, video_id):
691 """Report information extraction."""
692 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
694 def _real_extract(self, url):
695 # Extract id and simplified title from URL
696 mobj = re.match(self._VALID_URL, url)
698 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
701 video_id = mobj.group(1).split('_')[0].split('?')[0]
703 video_extension = 'mp4'
705 # Retrieve video webpage to extract further information
706 request = compat_urllib_request.Request(url)
707 request.add_header('Cookie', 'family_filter=off')
708 webpage = self._download_webpage(request, video_id)
710 # Extract URL, uploader and title from webpage
711 self.report_extraction(video_id)
712 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
714 self._downloader.trouble(u'ERROR: unable to extract media URL')
716 flashvars = compat_urllib_parse.unquote(mobj.group(1))
718 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
721 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
724 self._downloader.trouble(u'ERROR: unable to extract video URL')
727 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
729 self._downloader.trouble(u'ERROR: unable to extract video URL')
732 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
734 # TODO: support choosing qualities
736 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
738 self._downloader.trouble(u'ERROR: unable to extract title')
740 video_title = unescapeHTML(mobj.group('title'))
742 video_uploader = None
743 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
745 # lookin for official user
746 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
747 if mobj_official is None:
748 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
750 video_uploader = mobj_official.group(1)
752 video_uploader = mobj.group(1)
754 video_upload_date = None
755 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
757 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
762 'uploader': video_uploader,
763 'upload_date': video_upload_date,
764 'title': video_title,
765 'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
775 def __init__(self, downloader=None):
776 InfoExtractor.__init__(self, downloader)
778 def report_download_webpage(self, video_id):
779 """Report webpage download."""
780 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
782 def report_extraction(self, video_id):
783 """Report information extraction."""
784 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
786 def _real_extract(self, url):
787 # Extract id from URL
788 mobj = re.match(self._VALID_URL, url)
790 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
793 video_id = mobj.group(1)
795 video_extension = 'flv'
797 # Retrieve video webpage to extract further information
798 request = compat_urllib_request.Request(url)
800 self.report_download_webpage(video_id)
801 webpage = compat_urllib_request.urlopen(request).read()
802 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
803 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
806 # Extract URL, uploader, and title from webpage
807 self.report_extraction(video_id)
808 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
810 self._downloader.trouble(u'ERROR: unable to extract media URL')
812 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
816 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
818 self._downloader.trouble(u'ERROR: unable to extract title')
820 video_title = mobj.group(1).decode('utf-8')
822 video_uploader = mobj.group(2).decode('utf-8')
825 'id': video_id.decode('utf-8'),
826 'url': video_url.decode('utf-8'),
827 'uploader': video_uploader,
829 'title': video_title,
830 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
844 def __init__(self, downloader=None):
845 InfoExtractor.__init__(self, downloader)
847 def report_download_webpage(self, video_id):
848 """Report webpage download."""
849 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
851 def report_extraction(self, video_id):
852 """Report information extraction."""
853 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
855 def _real_extract(self, url, new_video=True):
856 # Extract ID from URL
857 mobj = re.match(self._VALID_URL, url)
859 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
862 video_id = mobj.group(2)
863 video_extension = 'flv'
865 # Rewrite valid but non-extractable URLs as
866 # extractable English language /watch/ URLs
867 if re.match(self._VPAGE_URL, url) is None:
868 request = compat_urllib_request.Request(url)
870 webpage = compat_urllib_request.urlopen(request).read()
871 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
872 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
875 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
877 self._downloader.trouble(u'ERROR: Unable to extract id field')
879 yahoo_id = mobj.group(1)
881 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
883 self._downloader.trouble(u'ERROR: Unable to extract vid field')
885 yahoo_vid = mobj.group(1)
887 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
888 return self._real_extract(url, new_video=False)
890 # Retrieve video webpage to extract further information
891 request = compat_urllib_request.Request(url)
893 self.report_download_webpage(video_id)
894 webpage = compat_urllib_request.urlopen(request).read()
895 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
896 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
899 # Extract uploader and title from webpage
900 self.report_extraction(video_id)
901 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
903 self._downloader.trouble(u'ERROR: unable to extract video title')
905 video_title = mobj.group(1).decode('utf-8')
907 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
909 self._downloader.trouble(u'ERROR: unable to extract video uploader')
911 video_uploader = mobj.group(1).decode('utf-8')
913 # Extract video thumbnail
914 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
916 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
918 video_thumbnail = mobj.group(1).decode('utf-8')
920 # Extract video description
921 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
923 self._downloader.trouble(u'ERROR: unable to extract video description')
925 video_description = mobj.group(1).decode('utf-8')
926 if not video_description:
927 video_description = 'No description available.'
929 # Extract video height and width
930 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
932 self._downloader.trouble(u'ERROR: unable to extract video height')
934 yv_video_height = mobj.group(1)
936 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
938 self._downloader.trouble(u'ERROR: unable to extract video width')
940 yv_video_width = mobj.group(1)
942 # Retrieve video playlist to extract media URL
943 # I'm not completely sure what all these options are, but we
944 # seem to need most of them, otherwise the server sends a 401.
945 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
946 yv_bitrate = '700' # according to Wikipedia this is hard-coded
947 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
948 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
949 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
951 self.report_download_webpage(video_id)
952 webpage = compat_urllib_request.urlopen(request).read()
953 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
954 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
957 # Extract media URL from playlist XML
958 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
960 self._downloader.trouble(u'ERROR: Unable to extract media URL')
962 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
963 video_url = unescapeHTML(video_url)
966 'id': video_id.decode('utf-8'),
968 'uploader': video_uploader,
970 'title': video_title,
971 'ext': video_extension.decode('utf-8'),
972 'thumbnail': video_thumbnail.decode('utf-8'),
973 'description': video_description,
977 class VimeoIE(InfoExtractor):
978 """Information extractor for vimeo.com."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (e.g. `if mobj is None:` guards, `try:` openers, `return`
# statements) are not shown; comments below describe the visible code only.
980 # _VALID_URL matches Vimeo URLs
981 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
984 def __init__(self, downloader=None):
985 InfoExtractor.__init__(self, downloader)
987 def report_download_webpage(self, video_id):
988 """Report webpage download."""
989 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
991 def report_extraction(self, video_id):
992 """Report information extraction."""
993 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
995 def _real_extract(self, url, new_video=True):
996 # Extract ID from URL
997 mobj = re.match(self._VALID_URL, url)
999 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1002 video_id = mobj.group(1)
1004 # Retrieve video webpage to extract further information
1005 request = compat_urllib_request.Request(url, None, std_headers)
1007 self.report_download_webpage(video_id)
1008 webpage_bytes = compat_urllib_request.urlopen(request).read()
1009 webpage = webpage_bytes.decode('utf-8')
1010 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1011 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1014 # Now we begin extracting as much information as we can from what we
1015 # retrieved. First we extract the information common to all extractors,
1016 # and latter we extract those that are Vimeo specific.
1017 self.report_extraction(video_id)
1019 # Extract the config JSON
# Brittle: locates the embedded player config by splitting the page text on
# literal markers rather than parsing HTML/JS; breaks if Vimeo changes markup.
1021 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1022 config = json.loads(config)
1024 self._downloader.trouble(u'ERROR: unable to extract info section')
1028 video_title = config["video"]["title"]
1030 # Extract uploader and uploader_id
1031 video_uploader = config["video"]["owner"]["name"]
1032 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1034 # Extract video thumbnail
1035 video_thumbnail = config["video"]["thumbnail"]
1037 # Extract video description
1038 video_description = get_element_by_attribute("itemprop", "description", webpage)
1039 if video_description: video_description = clean_html(video_description)
1040 else: video_description = ''
1042 # Extract upload date
1043 video_upload_date = None
1044 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1045 if mobj is not None:
# Concatenates YYYY + MM + DD into the YYYYMMDD format described in the
# InfoExtractor docstring for `upload_date`.
1046 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1048 # Vimeo specific: extract request signature and timestamp
1049 sig = config['request']['signature']
1050 timestamp = config['request']['timestamp']
1052 # Vimeo specific: extract video codec and quality information
1053 # First consider quality, then codecs, then take everything
1054 # TODO bind to format param
1055 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1056 files = { 'hd': [], 'sd': [], 'other': []}
1057 for codec_name, codec_extension in codecs:
1058 if codec_name in config["video"]["files"]:
1059 if 'hd' in config["video"]["files"][codec_name]:
1060 files['hd'].append((codec_name, codec_extension, 'hd'))
1061 elif 'sd' in config["video"]["files"][codec_name]:
1062 files['sd'].append((codec_name, codec_extension, 'sd'))
1064 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best available bucket in fixed preference order: hd, then sd,
# then whatever else the config listed.
1066 for quality in ('hd', 'sd', 'other'):
1067 if len(files[quality]) > 0:
1068 video_quality = files[quality][0][2]
1069 video_codec = files[quality][0][0]
1070 video_extension = files[quality][0][1]
1071 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1074 self._downloader.trouble(u'ERROR: no known codec found')
1077 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1078 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1083 'uploader': video_uploader,
1084 'uploader_id': video_uploader_id,
1085 'upload_date': video_upload_date,
1086 'title': video_title,
1087 'ext': video_extension,
1088 'thumbnail': video_thumbnail,
1089 'description': video_description,
1093 class ArteTvIE(InfoExtractor):
1094 """arte.tv information extractor."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, `return` statements, some literal
# arguments) are not shown; comments below describe the visible code only.
1096 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1097 _LIVE_URL = r'index-[0-9]+\.html$'
1099 IE_NAME = u'arte.tv'
1101 def __init__(self, downloader=None):
1102 InfoExtractor.__init__(self, downloader)
1104 def report_download_webpage(self, video_id):
1105 """Report webpage download."""
1106 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1108 def report_extraction(self, video_id):
1109 """Report information extraction."""
1110 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1112 def fetch_webpage(self, url):
# Download `url` and hand back the raw page body; network and URL errors
# are routed through the downloader's trouble() reporting.
1113 request = compat_urllib_request.Request(url)
1115 self.report_download_webpage(url)
1116 webpage = compat_urllib_request.urlopen(request).read()
1117 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1118 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1120 except ValueError as err:
1121 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1125 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, run `regex` over it, and copy the requested match groups
# into an info dict. Each matchTuple is (group index, dict key, error
# message to report when that group is missing).
1126 page = self.fetch_webpage(url)
1127 mobj = re.search(regex, page, regexFlags)
1131 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1134 for (i, key, err) in matchTuples:
1135 if mobj.group(i) is None:
1136 self._downloader.trouble(err)
1139 info[key] = mobj.group(i)
1143 def extractLiveStream(self, url):
# Live streams: the language code sits 4 path segments from the end of the
# URL (visible from the split below) — TODO confirm against real live URLs.
1144 video_lang = url.split('/')[-4]
1145 info = self.grep_webpage(
1147 r'src="(.*?/videothek_js.*?\.js)',
1150 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1153 http_host = url.split('/')[2]
1154 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1155 info = self.grep_webpage(
1157 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1158 '(http://.*?\.swf).*?' +
1162 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1163 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1164 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1167 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1169 def extractPlus7Stream(self, url):
# Arte+7 (catch-up) streams: follow a chain of three pages — the SWF param
# page, the language-specific <video> ref, then the final metadata XML.
1170 video_lang = url.split('/')[-3]
1171 info = self.grep_webpage(
1173 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1176 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1179 next_url = compat_urllib_parse.unquote(info.get('url'))
1180 info = self.grep_webpage(
1182 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1185 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1188 next_url = compat_urllib_parse.unquote(info.get('url'))
1190 info = self.grep_webpage(
1192 r'<video id="(.*?)".*?>.*?' +
1193 '<name>(.*?)</name>.*?' +
1194 '<dateVideo>(.*?)</dateVideo>.*?' +
1195 '<url quality="hd">(.*?)</url>',
1198 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1199 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1200 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1201 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1206 'id': info.get('id'),
1207 'url': compat_urllib_parse.unquote(info.get('url')),
1208 'uploader': u'arte.tv',
1209 'upload_date': info.get('date'),
1210 'title': info.get('title').decode('utf-8'),
1216 def _real_extract(self, url):
1217 video_id = url.split('/')[-1]
1218 self.report_extraction(video_id)
# Dispatch on URL shape: live pages match _LIVE_URL, everything else is
# treated as an Arte+7 catch-up page.
1220 if re.search(self._LIVE_URL, video_id) is not None:
1221 self.extractLiveStream(url)
1224 info = self.extractPlus7Stream(url)
1229 class GenericIE(InfoExtractor):
1230 """Generic last-resort information extractor."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, `return` statements, some call
# arguments) are not shown; comments below describe the visible code only.
1233 IE_NAME = u'generic'
1235 def __init__(self, downloader=None):
1236 InfoExtractor.__init__(self, downloader)
1238 def report_download_webpage(self, video_id):
1239 """Report webpage download."""
1240 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1241 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1243 def report_extraction(self, video_id):
1244 """Report information extraction."""
1245 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1247 def report_following_redirect(self, new_url):
1248 """Report information extraction."""
1249 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1251 def _test_redirect(self, url):
1252 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Uses an HTTP HEAD probe so the redirect target can be discovered without
# downloading the body; if the final URL differs, the download is restarted
# on the new URL so a more specific extractor gets a chance to match.
1253 class HeadRequest(compat_urllib_request.Request):
1254 def get_method(self):
1257 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1259 Subclass the HTTPRedirectHandler to make it use our
1260 HeadRequest also on the redirected URL
1262 def redirect_request(self, req, fp, code, msg, headers, newurl):
1263 if code in (301, 302, 303, 307):
1264 newurl = newurl.replace(' ', '%20')
# Strip body-describing headers: a HEAD request has no body, so
# forwarding content-length/content-type would be wrong.
1265 newheaders = dict((k,v) for k,v in req.headers.items()
1266 if k.lower() not in ("content-length", "content-type"))
1267 return HeadRequest(newurl,
1269 origin_req_host=req.get_origin_req_host(),
1272 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1274 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1276 Fallback to GET if HEAD is not allowed (405 HTTP error)
1278 def http_error_405(self, req, fp, code, msg, headers):
1282 newheaders = dict((k,v) for k,v in req.headers.items()
1283 if k.lower() not in ("content-length", "content-type"))
1284 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1286 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers needed for the HEAD probe.
1290 opener = compat_urllib_request.OpenerDirector()
1291 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1292 HTTPMethodFallback, HEADRedirectHandler,
1293 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1294 opener.add_handler(handler())
1296 response = opener.open(HeadRequest(url))
1297 new_url = response.geturl()
1302 self.report_following_redirect(new_url)
1303 self._downloader.download([new_url])
1306 def _real_extract(self, url):
1307 if self._test_redirect(url): return
1309 video_id = url.split('/')[-1]
1310 request = compat_urllib_request.Request(url)
1312 self.report_download_webpage(video_id)
1313 webpage = compat_urllib_request.urlopen(request).read()
1314 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1315 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1317 except ValueError as err:
1318 # since this is the last-resort InfoExtractor, if
1319 # this error is thrown, it'll be thrown here
1320 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1323 self.report_extraction(video_id)
1324 # Start with something easy: JW Player in SWFObject
1325 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1327 # Broaden the search a little bit
1328 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1330 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1333 # It's possible that one of the regexes
1334 # matched, but returned an empty group:
1335 if mobj.group(1) is None:
1336 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1339 video_url = compat_urllib_parse.unquote(mobj.group(1))
1340 video_id = os.path.basename(video_url)
1342 # here's a fun little line of code for you:
1343 video_extension = os.path.splitext(video_id)[1][1:]
1344 video_id = os.path.splitext(video_id)[0]
1346 # it's tempting to parse this further, but you would
1347 # have to take into account all the variations like
1348 # Video Title - Site Name
1349 # Site Name | Video Title
1350 # Video Title - Tagline | Site Name
1351 # and so on and so forth; it's just not practical
1352 mobj = re.search(r'<title>(.*)</title>', webpage)
1354 self._downloader.trouble(u'ERROR: unable to extract title')
1356 video_title = mobj.group(1)
1358 # video uploader is domain name
1359 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1361 self._downloader.trouble(u'ERROR: unable to extract title')
1363 video_uploader = mobj.group(1)
1368 'uploader': video_uploader,
1369 'upload_date': None,
1370 'title': video_title,
1371 'ext': video_extension,
1375 class YoutubeSearchIE(InfoExtractor):
1376 """Information Extractor for YouTube search queries."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, loop initializers, `return`
# statements) are not shown; comments below describe the visible code only.
1377 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1378 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1379 _max_youtube_results = 1000
1380 IE_NAME = u'youtube:search'
1382 def __init__(self, downloader=None):
1383 InfoExtractor.__init__(self, downloader)
1385 def report_download_page(self, query, pagenum):
1386 """Report attempt to download search page with given number."""
1387 query = query.decode(preferredencoding())
1388 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1390 def _real_extract(self, query):
# `query` is of the form "ytsearch<N>:terms", "ytsearchall:terms" or
# "ytsearch:terms"; the prefix decides how many results to download.
1391 mobj = re.match(self._VALID_URL, query)
1393 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1396 prefix, query = query.split(':')
1398 query = query.encode('utf-8')
1400 self._download_n_results(query, 1)
1402 elif prefix == 'all':
1403 self._download_n_results(query, self._max_youtube_results)
1409 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1411 elif n > self._max_youtube_results:
1412 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1413 n = self._max_youtube_results
1414 self._download_n_results(query, n)
1416 except ValueError: # parsing prefix as integer fails
1417 self._download_n_results(query, 1)
1420 def _download_n_results(self, query, n):
1421 """Downloads a specified number of results for a query"""
# Pages through the GData JSON API, 50 results per request, collecting
# video ids until `limit` (min of n and the API's totalItems) is reached.
1427 while (50 * pagenum) < limit:
1428 self.report_download_page(query, pagenum+1)
1429 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1430 request = compat_urllib_request.Request(result_url)
1432 data = compat_urllib_request.urlopen(request).read()
1433 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1434 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1436 api_response = json.loads(data)['data']
1438 new_ids = list(video['id'] for video in api_response['items'])
1439 video_ids += new_ids
1441 limit = min(n, api_response['totalItems'])
1444 if len(video_ids) > n:
1445 video_ids = video_ids[:n]
# Each id is handed back to the downloader as a watch URL so the regular
# YouTube extractor processes it.
1446 for id in video_ids:
1447 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1451 class GoogleSearchIE(InfoExtractor):
1452 """Information Extractor for Google Video search queries."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, loop initializers, `return`
# statements) are not shown; comments below describe the visible code only.
1453 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1454 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1455 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1456 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1457 _max_google_results = 1000
1458 IE_NAME = u'video.google:search'
1460 def __init__(self, downloader=None):
1461 InfoExtractor.__init__(self, downloader)
1463 def report_download_page(self, query, pagenum):
1464 """Report attempt to download playlist page with given number."""
1465 query = query.decode(preferredencoding())
1466 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1468 def _real_extract(self, query):
# Same prefix convention as the other search IEs: "gvsearch<N>:", 
# "gvsearchall:" or plain "gvsearch:" (one result).
1469 mobj = re.match(self._VALID_URL, query)
1471 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1474 prefix, query = query.split(':')
1476 query = query.encode('utf-8')
1478 self._download_n_results(query, 1)
1480 elif prefix == 'all':
1481 self._download_n_results(query, self._max_google_results)
1487 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1489 elif n > self._max_google_results:
1490 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1491 n = self._max_google_results
1492 self._download_n_results(query, n)
1494 except ValueError: # parsing prefix as integer fails
1495 self._download_n_results(query, 1)
1498 def _download_n_results(self, query, n):
1499 """Downloads a specified number of results for a query"""
# Scrapes HTML result pages (10 per page, via start=pagenum*10) rather
# than an API; ids are deduplicated against the list itself.
1505 self.report_download_page(query, pagenum)
1506 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1507 request = compat_urllib_request.Request(result_url)
1509 page = compat_urllib_request.urlopen(request).read()
1510 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1511 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1514 # Extract video identifiers
1515 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1516 video_id = mobj.group(1)
1517 if video_id not in video_ids:
1518 video_ids.append(video_id)
1519 if len(video_ids) == n:
1520 # Specified n videos reached
1521 for id in video_ids:
1522 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next page" link means the results are exhausted: download what we have.
1525 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1526 for id in video_ids:
1527 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1530 pagenum = pagenum + 1
1533 class YahooSearchIE(InfoExtractor):
1534 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, loop initializers, `return`
# statements) are not shown; comments below describe the visible code only.
1537 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1538 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1539 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1540 _MORE_PAGES_INDICATOR = r'\s*Next'
1541 _max_yahoo_results = 1000
1542 IE_NAME = u'video.yahoo:search'
1544 def __init__(self, downloader=None):
1545 InfoExtractor.__init__(self, downloader)
1547 def report_download_page(self, query, pagenum):
1548 """Report attempt to download playlist page with given number."""
1549 query = query.decode(preferredencoding())
1550 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1552 def _real_extract(self, query):
# Same prefix convention as the other search IEs: "yvsearch<N>:",
# "yvsearchall:" or plain "yvsearch:" (one result).
1553 mobj = re.match(self._VALID_URL, query)
1555 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1558 prefix, query = query.split(':')
1560 query = query.encode('utf-8')
1562 self._download_n_results(query, 1)
1564 elif prefix == 'all':
1565 self._download_n_results(query, self._max_yahoo_results)
1571 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1573 elif n > self._max_yahoo_results:
1574 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1575 n = self._max_yahoo_results
1576 self._download_n_results(query, n)
1578 except ValueError: # parsing prefix as integer fails
1579 self._download_n_results(query, 1)
1582 def _download_n_results(self, query, n):
1583 """Downloads a specified number of results for a query"""
# Unlike GoogleSearchIE, dedup is done against a dedicated set so the
# ordered id list stays separate from the membership test.
1586 already_seen = set()
1590 self.report_download_page(query, pagenum)
1591 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1592 request = compat_urllib_request.Request(result_url)
1594 page = compat_urllib_request.urlopen(request).read()
1595 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1596 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1599 # Extract video identifiers
1600 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1601 video_id = mobj.group(1)
1602 if video_id not in already_seen:
1603 video_ids.append(video_id)
1604 already_seen.add(video_id)
1605 if len(video_ids) == n:
1606 # Specified n videos reached
1607 for id in video_ids:
1608 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link means the results are exhausted: download what we have.
1611 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1612 for id in video_ids:
1613 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1616 pagenum = pagenum + 1
1619 class YoutubePlaylistIE(InfoExtractor):
1620 """Information Extractor for YouTube playlists."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, loop initializers, `return`
# statements) are not shown; comments below describe the visible code only.
1622 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1623 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1624 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1625 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1626 IE_NAME = u'youtube:playlist'
1628 def __init__(self, downloader=None):
1629 InfoExtractor.__init__(self, downloader)
1631 def report_download_page(self, playlist_id, pagenum):
1632 """Report attempt to download playlist page with given number."""
1633 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1635 def _real_extract(self, url):
1636 # Extract playlist id
1637 mobj = re.match(self._VALID_URL, url)
1639 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 of _VALID_URL captures a single video id embedded in the playlist
# URL; when present, download just that video instead of the whole list.
1643 if mobj.group(3) is not None:
1644 self._downloader.download([mobj.group(3)])
1647 # Download playlist pages
1648 # prefix is 'p' as default for playlists but there are other types that need extra care
1649 playlist_prefix = mobj.group(1)
1650 if playlist_prefix == 'a':
1651 playlist_access = 'artist'
1653 playlist_prefix = 'p'
1654 playlist_access = 'view_play_list'
1655 playlist_id = mobj.group(2)
1660 self.report_download_page(playlist_id, pagenum)
1661 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1662 request = compat_urllib_request.Request(url)
1664 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1665 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1666 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1669 # Extract video identifiers
1671 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1672 if mobj.group(1) not in ids_in_page:
1673 ids_in_page.append(mobj.group(1))
1674 video_ids.extend(ids_in_page)
1676 if self._MORE_PAGES_INDICATOR not in page:
1678 pagenum = pagenum + 1
1680 total = len(video_ids)
# Apply --playlist-start / --playlist-end window; playliststart is stored
# 1-based in params, hence the -1 to get a list index.
1682 playliststart = self._downloader.params.get('playliststart', 1) - 1
1683 playlistend = self._downloader.params.get('playlistend', -1)
1684 if playlistend == -1:
1685 video_ids = video_ids[playliststart:]
1687 video_ids = video_ids[playliststart:playlistend]
1689 if len(video_ids) == total:
1690 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1692 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1694 for id in video_ids:
1695 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1699 class YoutubeChannelIE(InfoExtractor):
1700 """Information Extractor for YouTube channels."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, loop initializers, `return`
# statements) are not shown; comments below describe the visible code only.
1702 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1703 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1704 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1705 IE_NAME = u'youtube:channel'
1707 def report_download_page(self, channel_id, pagenum):
1708 """Report attempt to download channel page with given number."""
1709 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1711 def _real_extract(self, url):
1712 # Extract channel id
1713 mobj = re.match(self._VALID_URL, url)
1715 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1718 # Download channel pages
1719 channel_id = mobj.group(1)
1724 self.report_download_page(channel_id, pagenum)
1725 url = self._TEMPLATE_URL % (channel_id, pagenum)
1726 request = compat_urllib_request.Request(url)
# 'utf8' is an alias of 'utf-8'; equivalent to the decode used elsewhere.
1728 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1729 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1730 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1733 # Extract video identifiers
1735 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1736 if mobj.group(1) not in ids_in_page:
1737 ids_in_page.append(mobj.group(1))
1738 video_ids.extend(ids_in_page)
# Stop paging when the "Next »" marker disappears from the channel page.
1740 if self._MORE_PAGES_INDICATOR not in page:
1742 pagenum = pagenum + 1
1744 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1746 for id in video_ids:
1747 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1751 class YoutubeUserIE(InfoExtractor):
1752 """Information Extractor for YouTube users."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, loop initializers, `return`
# statements) are not shown; comments below describe the visible code only.
1754 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1755 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1756 _GDATA_PAGE_SIZE = 50
1757 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1758 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1759 IE_NAME = u'youtube:user'
1761 def __init__(self, downloader=None):
1762 InfoExtractor.__init__(self, downloader)
1764 def report_download_page(self, username, start_index):
1765 """Report attempt to download user page."""
1766 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1767 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1769 def _real_extract(self, url):
1771 mobj = re.match(self._VALID_URL, url)
1773 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1776 username = mobj.group(1)
1778 # Download video ids using YouTube Data API. Result size per
1779 # query is limited (currently to 50 videos) so we need to query
1780 # page by page until there are no video ids - it means we got
# GData's start-index parameter is 1-based, hence the +1.
1787 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1788 self.report_download_page(username, start_index)
1790 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1793 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1794 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1795 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1798 # Extract video identifiers
1801 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1802 if mobj.group(1) not in ids_in_page:
1803 ids_in_page.append(mobj.group(1))
1805 video_ids.extend(ids_in_page)
1807 # A little optimization - if current page is not
1808 # "full", ie. does not contain PAGE_SIZE video ids then
1809 # we can assume that this page is the last one - there
1810 # are no more ids on further pages - no need to query
1813 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1818 all_ids_count = len(video_ids)
# Apply --playlist-start / --playlist-end window (same logic as the
# playlist extractor).
1819 playliststart = self._downloader.params.get('playliststart', 1) - 1
1820 playlistend = self._downloader.params.get('playlistend', -1)
1822 if playlistend == -1:
1823 video_ids = video_ids[playliststart:]
1825 video_ids = video_ids[playliststart:playlistend]
1827 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1828 (username, all_ids_count, len(video_ids)))
1830 for video_id in video_ids:
1831 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1834 class BlipTVUserIE(InfoExtractor):
1835 """Information Extractor for blip.tv users."""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, loop initializers, `return`
# statements, and apparently the _PAGE_SIZE class attribute used below) are
# not shown; comments below describe the visible code only.
1837 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1839 IE_NAME = u'blip.tv:user'
1841 def __init__(self, downloader=None):
1842 InfoExtractor.__init__(self, downloader)
1844 def report_download_page(self, username, pagenum):
1845 """Report attempt to download user page."""
1846 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1847 (self.IE_NAME, username, pagenum))
1849 def _real_extract(self, url):
1851 mobj = re.match(self._VALID_URL, url)
1853 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1856 username = mobj.group(1)
1858 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# First fetch the profile page only to scrape the numeric users_id needed
# by the episode-list AJAX endpoint.
1860 request = compat_urllib_request.Request(url)
1863 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1864 mobj = re.search(r'data-users-id="([^"]+)"', page)
1865 page_base = page_base % mobj.group(1)
1866 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1867 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1871 # Download video ids using BlipTV Ajax calls. Result size per
1872 # query is limited (currently to 12 videos) so we need to query
1873 # page by page until there are no video ids - it means we got
1880 self.report_download_page(username, pagenum)
1882 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1885 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1886 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Inconsistency with the sibling extractors: uses str(err) here instead
# of compat_str(err).
1887 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1890 # Extract video identifiers
1893 for mobj in re.finditer(r'href="/([^"]+)"', page):
1894 if mobj.group(1) not in ids_in_page:
1895 ids_in_page.append(unescapeHTML(mobj.group(1)))
1897 video_ids.extend(ids_in_page)
1899 # A little optimization - if current page is not
1900 # "full", ie. does not contain PAGE_SIZE video ids then
1901 # we can assume that this page is the last one - there
1902 # are no more ids on further pages - no need to query
# self._PAGE_SIZE is not defined in the visible lines — presumably a class
# attribute elsewhere in the original file; verify before refactoring.
1905 if len(ids_in_page) < self._PAGE_SIZE:
1910 all_ids_count = len(video_ids)
1911 playliststart = self._downloader.params.get('playliststart', 1) - 1
1912 playlistend = self._downloader.params.get('playlistend', -1)
1914 if playlistend == -1:
1915 video_ids = video_ids[playliststart:]
1917 video_ids = video_ids[playliststart:playlistend]
1919 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1920 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1922 for video_id in video_ids:
1923 self._downloader.download([u'http://blip.tv/'+video_id])
1926 class DepositFilesIE(InfoExtractor):
1927 """Information extractor for depositfiles.com"""
# NOTE(review): this listing is a sampled excerpt — intervening lines of the
# original file (guards, `try:` openers, `return` statements) are not shown;
# comments below describe the visible code only.
1929 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1931 def report_download_webpage(self, file_id):
1932 """Report webpage download."""
1933 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1935 def report_extraction(self, file_id):
1936 """Report information extraction."""
1937 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1939 def _real_extract(self, url):
1940 file_id = url.split('/')[-1]
1941 # Rebuild url in english locale
1942 url = 'http://depositfiles.com/en/files/' + file_id
1944 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button so
# the response contains the real file URL.
1945 free_download_indication = { 'gateway_result' : '1' }
1946 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1948 self.report_download_webpage(file_id)
1949 webpage = compat_urllib_request.urlopen(request).read()
1950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1951 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1954 # Search for the real file URL
1955 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1956 if (mobj is None) or (mobj.group(1) is None):
1957 # Try to figure out reason of the error.
1958 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1959 if (mobj is not None) and (mobj.group(1) is not None):
1960 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1961 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1963 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1966 file_url = mobj.group(1)
1967 file_extension = os.path.splitext(file_url)[1][1:]
1969 # Search for file title
1970 mobj = re.search(r'<b title="(.*?)">', webpage)
1972 self._downloader.trouble(u'ERROR: unable to extract title')
# Python-2-era idiom: .decode('utf-8') on byte strings; this whole block
# predates the py3 str/bytes split.
1974 file_title = mobj.group(1).decode('utf-8')
1977 'id': file_id.decode('utf-8'),
1978 'url': file_url.decode('utf-8'),
1980 'upload_date': None,
1981 'title': file_title,
1982 'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Group 'ID' captures the numeric video id from video.php/photo.php URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc when --netrc is used.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # Optional login step using --username/--password or .netrc credentials;
        # failures are warnings, not fatal errors.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal script fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_url = params['hd_src']
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # Result dictionary fields.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for a JSON description of the page's media.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv varies its response by client; pretend to be iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # Server returned the media itself: derive title/ext from the URL.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

            # blip.tv datestamps look like '11-27-12 08:00AM'; normalize to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # Result dictionary fields.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Builds the media URL from the thumbnail's image_src link (same host and
    movie path, with the video id as an .flv filename) and takes the title
    from the page's <title> element.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.trouble(...)` — the attribute is
            # `_downloader` everywhere else in this file; the typo raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media host/path; append "<id>.flv"
        # to obtain the downloadable media URL.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date': None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE: written with re.VERBOSE, hence the custom suitable() below.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Maps format id -> file extension / display dimensions (bodies elided here).
    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base method because _VALID_URL needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Print a format table for --list-formats.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Expand ":tds"-style shortnames to the show's full-episodes page
        # and re-match so the named groups are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Clip URLs carry the title directly; episode URLs may need a fetch.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Follow any server redirect so the URL groups reflect the final page.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Download the MRSS show index listing the episode's parts.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per episode part; fetch a mediagen config for each.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            # Collect (bitrate, rtmp-url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                format, rtmp_video_url = f, v

            # Rewrite the RTMP reference into a plain HTTP progressive URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            'upload_date': officialDate,
            'description': officialTitle,

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode using the charset advertised in Content-Type, defaulting to utf-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Metadata comes from <meta> tags; the og:video URL embeds a
        # percent-encoded config= parameter pointing at the player config.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')
        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        # Result dictionary fields.
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        # Step 1: fetch the moogaloop metadata XML for title/description/manifest URL.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # Step 2: fetch the Adobe f4m manifest and derive the media URL from it.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # Elements are namespaced with the f4m 1.0 namespace.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Rebuild the direct segment URL from the manifest's host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # Group 1 captures the numeric video id.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the page's flv_url variable).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from <title>, dropping the site's " - XVID..." suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0) is used deliberately: the whole matched URL is the thumbnail.
        video_thumbnail = mobj.group(0)

        # Result dictionary fields.
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # Group 1 is the uploader slug, group 2 the track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to a numeric track id via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream catalogue and use the 128kbit MP3 HTTP stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # Result dictionary fields.
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded in the page's jsclassref variable.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # Result dictionary fields.
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest
        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # Probe each candidate; skip those that fail to open.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        # Print a format/bitrate/extension table for --list-formats.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # With no explicit format request, take the first format whose URL
        # list yields a live URL; otherwise honor the requested format.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # Result dictionary fields.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Groups 'course' and 'video' select between the three page kinds handled
    # below: a single video, a course page, or the root index.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            # Per-video metadata lives in a sibling .xml file.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                    note='Downloading course info page',
                                    errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage becomes a reference entry, recursively
            # extracted below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])

            # Root page: enumerate all courses and recurse into each.
            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
2974 class MTVIE(InfoExtractor):
2975 """Information extractor for MTV.com"""
2977 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2980 def report_extraction(self, video_id):
2981 """Report information extraction."""
2982 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2984 def _real_extract(self, url):
# Scrapes <meta> tags from the video page, then fetches the mediaGen
# XML playlist to pick a rendition.
# NOTE(review): sampled listing — `if mobj is None:` guards and `try:`
# lines between the visible statements are elided.
2985 mobj = re.match(self._VALID_URL, url)
2987 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; normalize to http:// when absent.
2989 if not mobj.group('proto'):
2990 url = 'http://' + url
2991 video_id = mobj.group('videoid')
2993 webpage = self._download_webpage(url, video_id)
# Song name / performer come from mtv_vt / mtv_an meta tags.
# NOTE(review): .decode('iso-8859-1') on a match group suggests this
# path assumes Python-2 byte strings — confirm against py3 behavior.
2995 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2997 self._downloader.trouble(u'ERROR: unable to extract song name')
2999 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3000 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3002 self._downloader.trouble(u'ERROR: unable to extract performer')
3004 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3005 video_title = performer + ' - ' + song_name
3007 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3009 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3011 mtvn_uri = mobj.group(1)
3013 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3015 self._downloader.trouble(u'ERROR: unable to extract content id')
3017 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing renditions.
3019 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3020 self.report_extraction(video_id)
3021 request = compat_urllib_request.Request(videogen_url)
3023 metadataXml = compat_urllib_request.urlopen(request).read()
3024 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3025 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3028 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3029 renditions = mdoc.findall('.//rendition')
3031 # For now, always pick the highest quality.
3032 rendition = renditions[-1]
# Extension comes from the MIME type's subtype ("video/mp4" -> "mp4").
3035 _,_,ext = rendition.attrib['type'].partition('/')
3036 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3037 video_url = rendition.find('./src').text
3039 self._downloader.trouble('Invalid rendition field.')
3045 'uploader': performer,
3046 'upload_date': None,
3047 'title': video_title,
3055 class YoukuIE(InfoExtractor):
# Extractor for v.youku.com. Videos are served in numbered segments;
# each segment URL is derived from a seeded-shuffle-decoded file id.
3056 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3058 def report_download_webpage(self, file_id):
3059 """Report webpage download."""
3060 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3062 def report_extraction(self, file_id):
3063 """Report information extraction."""
3064 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Build a pseudo-unique session id from the current millisecond
# timestamp plus two random components (Youku URL requirement).
3067 nowTime = int(time.time() * 1000)
3068 random1 = random.randint(1000,1998)
3069 random2 = random.randint(1000,9999)
3071 return "%d%d%d" %(nowTime,random1,random2)
3073 def _get_file_ID_mix_string(self, seed):
# Deterministic seeded shuffle of Youku's alphabet: a linear
# congruential step (seed*211+30031 mod 65536) selects-and-removes
# one character per iteration. Order of operations matters; do not
# reorder these statements.
3075 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3077 for i in range(len(source)):
3078 seed = (seed * 211 + 30031 ) % 65536
3079 index = math.floor(seed / 65536 * len(source) )
3080 mixed.append(source[int(index)])
3081 source.remove(source[int(index)])
3082 #return ''.join(mixed)
3085 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated fileId: each numeric token indexes into
# the seed-shuffled alphabet produced above.
3086 mixed = self._get_file_ID_mix_string(seed)
3087 ids = fileId.split('*')
3091 realId.append(mixed[int(ch)])
3092 return ''.join(realId)
3094 def _real_extract(self, url):
# NOTE(review): sampled listing — `if mobj is None:`/`try:` guard
# lines between visible statements are elided.
3095 mobj = re.match(self._VALID_URL, url)
3097 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3099 video_id = mobj.group('ID')
# getPlayList returns JSON describing title, seed, formats and segments.
3101 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3103 request = compat_urllib_request.Request(info_url, None, std_headers)
3105 self.report_download_webpage(video_id)
3106 jsondata = compat_urllib_request.urlopen(request).read()
3107 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3108 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3111 self.report_extraction(video_id)
3113 jsonstr = jsondata.decode('utf-8')
3114 config = json.loads(jsonstr)
3116 video_title = config['data'][0]['title']
3117 seed = config['data'][0]['seed']
# Map the user-requested format onto what the site offers; 'best'
# prefers hd2 when available (full preference chain elided here).
3119 format = self._downloader.params.get('format', None)
3120 supported_format = list(config['data'][0]['streamfileids'].keys())
3122 if format is None or format == 'best':
3123 if 'hd2' in supported_format:
3128 elif format == 'worst':
3136 fileid = config['data'][0]['streamfileids'][format]
3137 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3138 except (UnicodeDecodeError, ValueError, KeyError):
3139 self._downloader.trouble(u'ERROR: unable to extract info section')
3143 sid = self._gen_sid()
3144 fileid = self._get_file_id(fileid, seed)
3146 #column 8,9 of fileid represent the segment number
3147 #fileid[7:9] should be changed
3148 for index, key in enumerate(keys):
# Splice the zero-based segment index (2 hex digits) into the fileid.
3150 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3151 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# One info dict per segment; parts share the title, ids are suffixed.
3154 'id': '%s_part%02d' % (video_id, index),
3155 'url': download_url,
3157 'upload_date': None,
3158 'title': video_title,
3161 files_info.append(info)
3166 class XNXXIE(InfoExtractor):
3167 """Information extractor for xnxx.com"""
3169 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scrape patterns: flv URL and thumbnail are URL-encoded query
# parameters embedded in the page; title comes from <title>.
3171 VIDEO_URL_RE = r'flv_url=(.*?)&'
3172 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3173 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3175 def report_webpage(self, video_id):
3176 """Report information extraction"""
3177 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3179 def report_extraction(self, video_id):
3180 """Report information extraction"""
3181 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3183 def _real_extract(self, url):
# NOTE(review): sampled listing — `if mobj is None:`/`if result is
# None:` guard lines before the trouble() calls are elided.
3184 mobj = re.match(self._VALID_URL, url)
3186 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3188 video_id = mobj.group(1)
3190 self.report_webpage(video_id)
3192 # Get webpage content
3194 webpage_bytes = compat_urllib_request.urlopen(url).read()
3195 webpage = webpage_bytes.decode('utf-8')
3196 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3197 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the page; unquote to a usable URL.
3200 result = re.search(self.VIDEO_URL_RE, webpage)
3202 self._downloader.trouble(u'ERROR: unable to extract video url')
3204 video_url = compat_urllib_parse.unquote(result.group(1))
3206 result = re.search(self.VIDEO_TITLE_RE, webpage)
3208 self._downloader.trouble(u'ERROR: unable to extract video title')
3210 video_title = result.group(1)
3212 result = re.search(self.VIDEO_THUMB_RE, webpage)
3214 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3216 video_thumbnail = result.group(1)
3222 'upload_date': None,
3223 'title': video_title,
3225 'thumbnail': video_thumbnail,
3226 'description': None,
3230 class GooglePlusIE(InfoExtractor):
3231 """Information extractor for plus.google.com."""
3233 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3234 IE_NAME = u'plus.google'
3236 def __init__(self, downloader=None):
3237 InfoExtractor.__init__(self, downloader)
3239 def report_extract_entry(self, url):
3240 """Report downloading extry"""
3241 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3243 def report_date(self, upload_date):
3244 """Report downloading extry"""
3245 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3247 def report_uploader(self, uploader):
3248 """Report downloading extry"""
3249 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3251 def report_title(self, video_title):
3252 """Report downloading extry"""
3253 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3255 def report_extract_vid_page(self, video_page):
3256 """Report information extraction."""
3257 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3259 def _real_extract(self, url):
# Two-step scrape: (1) the post page yields date/uploader/title and
# the photo/video page URL; (2) that page lists googlevideo redirector
# links at several resolutions, from which the highest is chosen.
# NOTE(review): sampled listing — `if mobj is None:` guards and some
# fallback branches between visible statements are elided.
3260 # Extract id from URL
3261 mobj = re.match(self._VALID_URL, url)
3263 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3266 post_url = mobj.group(0)
3267 video_id = mobj.group(1)
3269 video_extension = 'flv'
3271 # Step 1, Retrieve post webpage to extract further information
3272 self.report_extract_entry(post_url)
3273 request = compat_urllib_request.Request(post_url)
3275 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3276 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3277 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3280 # Extract update date
3282 pattern = 'title="Timestamp">(.*?)</a>'
3283 mobj = re.search(pattern, webpage)
3285 upload_date = mobj.group(1)
3286 # Convert timestring to a format suitable for filename
3287 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3288 upload_date = upload_date.strftime('%Y%m%d')
3289 self.report_date(upload_date)
3293 pattern = r'rel\="author".*?>(.*?)</a>'
3294 mobj = re.search(pattern, webpage)
3296 uploader = mobj.group(1)
3297 self.report_uploader(uploader)
3300 # Get the first line for title
3302 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3303 mobj = re.search(pattern, webpage)
3305 video_title = mobj.group(1)
3306 self.report_title(video_title)
3308 # Step 2, Stimulate clicking the image box to launch video
3309 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3310 mobj = re.search(pattern, webpage)
3312 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3314 video_page = mobj.group(1)
3315 request = compat_urllib_request.Request(video_page)
3317 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3318 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3319 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3321 self.report_extract_vid_page(video_page)
3324 # Extract video links on video page
3325 """Extract video links of all sizes"""
3326 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3327 mobj = re.findall(pattern, webpage)
3329 self._downloader.trouble(u'ERROR: unable to extract video links')
3331 # Sort in resolution
3332 links = sorted(mobj)
3334 # Choose the lowest of the sort, i.e. highest resolution
3335 video_url = links[-1]
3336 # Only get the url. The resolution part in the tuple has no use anymore
3337 video_url = video_url[-1]
3338 # Treat escaped \u0026 style hex
# py2/py3 split: str has no .decode on py3, so AttributeError routes
# to the bytes round-trip below.
3340 video_url = video_url.decode("unicode_escape")
3341 except AttributeError: # Python 3
3342 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3348 'uploader': uploader,
3349 'upload_date': upload_date,
3350 'title': video_title,
3351 'ext': video_extension,
3354 class NBAIE(InfoExtractor):
# Extractor for nba.com video pages. The media URL is built directly
# from the path id against Turner's CDN (hard-coded 1280x720 mp4).
3355 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3358 def _real_extract(self, url):
# NOTE(review): sampled listing — the `if mobj is None:` guard before
# the trouble() call is elided.
3359 mobj = re.match(self._VALID_URL, url)
3361 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3364 video_id = mobj.group(1)
3365 if video_id.endswith('/index.html'):
3366 video_id = video_id[:-len('/index.html')]
3368 webpage = self._download_webpage(url, video_id)
3370 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small closure: scrape one property from the page, unescaped, with a
# caller-supplied default when the pattern is absent.
3371 def _findProp(rexp, default=None):
3372 m = re.search(rexp, webpage)
3374 return unescapeHTML(m.group(1))
3378 shortened_video_id = video_id.rpartition('/')[2]
3379 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3381 'id': shortened_video_id,
# NOTE(review): key is 'uploader_date' here, not 'upload_date' —
# looks like a typo relative to the documented field names; confirm.
3385 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3386 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3390 class JustinTVIE(InfoExtractor):
3391 """Information extractor for justin.tv and twitch.tv"""
3392 # TODO: One broadcast may be split into multiple videos. The key
3393 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3394 # starts at 1 and increases. Can we treat all parts as one video?
3396 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3397 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# API page size used for channel-archive pagination.
3398 _JUSTIN_PAGE_LIMIT = 100
3399 IE_NAME = u'justin.tv'
3401 def report_extraction(self, file_id):
3402 """Report information extraction."""
3403 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3405 def report_download_page(self, channel, offset):
3406 """Report attempt to download a single page of videos."""
3407 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3408 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3410 # Return count of items, list of *valid* items
3411 def _parse_page(self, url):
# Fetch one API page of clips and convert each into an info dict.
# NOTE(review): sampled listing — the `try:` opener for the urlopen
# below and some loop/guard lines are elided.
3413 urlh = compat_urllib_request.urlopen(url)
3414 webpage_bytes = urlh.read()
3415 webpage = webpage_bytes.decode('utf-8', 'ignore')
3416 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3417 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3420 response = json.loads(webpage)
# A non-list response is an API error object with an 'error' field.
3421 if type(response) != list:
3422 error_text = response.get('error', 'unknown error')
3423 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3426 for clip in response:
3427 video_url = clip['video_file_url']
3429 video_extension = os.path.splitext(video_url)[1][1:]
# start_time "YYYY-MM-DD..." -> "YYYYMMDD".
3430 video_date = re.sub('-', '', clip['start_time'][:10])
3431 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3432 video_id = clip['id']
3433 video_title = clip.get('title', video_id)
3437 'title': video_title,
3438 'uploader': clip.get('channel_name', video_uploader_id),
3439 'uploader_id': video_uploader_id,
3440 'upload_date': video_date,
3441 'ext': video_extension,
3443 return (len(response), info)
3445 def _real_extract(self, url):
# Group 1 = channel name (paged archive listing); group 2 = single
# broadcast id. mobj.lastindex distinguishes the two cases.
3446 mobj = re.match(self._VALID_URL, url)
3448 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3451 api = 'http://api.justin.tv'
3452 video_id = mobj.group(mobj.lastindex)
3454 if mobj.lastindex == 1:
3456 api += '/channel/archives/%s.json'
3458 api += '/broadcast/by_archive/%s.json'
3459 api = api % (video_id,)
3461 self.report_extraction(video_id)
# Page through the API until a short (non-full) page signals the end;
# unpaged (single-broadcast) requests stop after one iteration.
3465 limit = self._JUSTIN_PAGE_LIMIT
3468 self.report_download_page(video_id, offset)
3469 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3470 page_count, page_info = self._parse_page(page_url)
3471 info.extend(page_info)
3472 if not paged or page_count != limit:
3477 class FunnyOrDieIE(InfoExtractor):
# Extractor for funnyordie.com video pages (HTML5 <video>/<source>
# scraping; description from the og:description meta tag).
3478 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3480 def _real_extract(self, url):
# NOTE(review): sampled listing — `if mobj is None:` / `if m is None:`
# guard lines before the trouble() calls are elided.
3481 mobj = re.match(self._VALID_URL, url)
3483 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3486 video_id = mobj.group('id')
3487 webpage = self._download_webpage(url, video_id)
# Second <source> inside the <video> tag carries the file URL.
3489 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3491 self._downloader.trouble(u'ERROR: unable to find video information')
3492 video_url = unescapeHTML(m.group('url'))
3494 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3496 self._downloader.trouble(u'Cannot find video title')
3497 title = unescapeHTML(m.group('title'))
3499 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3501 desc = unescapeHTML(m.group('desc'))
3510 'description': desc,
3514 class TweetReelIE(InfoExtractor):
# Extractor for tweetreel.com: scrapes status id, tweet text,
# uploader and unixtime from the page, then builds the fixed .mov URL.
3515 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3517 def _real_extract(self, url):
# NOTE(review): sampled listing — `if m is None:` guard lines before
# the trouble() calls are elided.
3518 mobj = re.match(self._VALID_URL, url)
3520 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3523 video_id = mobj.group('id')
3524 webpage = self._download_webpage(url, video_id)
3526 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3528 self._downloader.trouble(u'ERROR: Cannot find status ID')
3529 status_id = m.group(1)
3531 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3533 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip anchor tags from the tweet text before unescaping.
3534 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3536 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3538 self._downloader.trouble(u'ERROR: Cannot find uploader')
3539 uploader = unescapeHTML(m.group('uploader'))
3540 uploader_id = unescapeHTML(m.group('uploader_id'))
3542 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3544 self._downloader.trouble(u'ERROR: Cannot find upload date')
# NOTE(review): fromtimestamp uses local time, so the YYYYMMDD date
# depends on the machine's timezone — confirm that is intended.
3545 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3548 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3555 'description': desc,
3556 'uploader': uploader,
3557 'uploader_id': uploader_id,
3558 'internal_id': status_id,
3559 'upload_date': upload_date
3563 class SteamIE(InfoExtractor):
# Extractor for store.steampowered.com game trailer pages; one page
# can yield several videos (one per 'movie_NNN' player entry).
3564 _VALID_URL = r"""http://store.steampowered.com/
3565 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3567 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3570 def suitable(self, url):
3571 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written in verbose (re.X) mode.
3572 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3574 def _real_extract(self, url):
# NOTE(review): sampled listing — result-list setup and some guard
# lines between visible statements are elided.
3575 m = re.match(self._VALID_URL, url, re.VERBOSE)
# Matches the JS player config: movie id, file URL, optional name.
3576 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3577 gameID = m.group('gameID')
3578 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3579 webpage = self._download_webpage(videourl, gameID)
# Pair each player entry with the corresponding on-page title span.
3580 mweb = re.finditer(urlRE, webpage)
3581 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3582 titles = re.finditer(namesRE, webpage)
3584 for vid,vtitle in zip(mweb,titles):
3585 video_id = vid.group('videoID')
3586 title = vtitle.group('videoName')
3587 video_url = vid.group('videoURL')
3589 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3594 'title': unescapeHTML(title)
3599 class UstreamIE(InfoExtractor):
# Extractor for ustream.tv recorded videos. The media URL is derived
# directly from the numeric video id against the tcdn host.
3600 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3601 IE_NAME = u'ustream'
3603 def _real_extract(self, url):
3604 m = re.match(self._VALID_URL, url)
3605 video_id = m.group('videoID')
3606 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3607 webpage = self._download_webpage(url, video_id)
# Title and uploader (channel content id) are scraped from data-*
# attributes. NOTE(review): no None-check on these searches — a page
# change would raise AttributeError here; confirm acceptable.
3608 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3609 title = m.group('title')
3610 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3611 uploader = m.group('uploader')
3617 'uploader': uploader
3621 class RBMARadioIE(InfoExtractor):
# Extractor for rbmaradio.com shows: metadata is embedded in the page
# as a JSON blob assigned to gon.show inside a <script> tag.
3622 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3624 def _real_extract(self, url):
3625 m = re.match(self._VALID_URL, url)
3626 video_id = m.group('videoID')
3628 webpage = self._download_webpage(url, video_id)
3629 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3631 raise ExtractorError(u'Cannot find metadata')
3632 json_data = m.group(1)
3635 data = json.loads(json_data)
3636 except ValueError as e:
3637 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256kbps variant; extension comes from the URL path.
3639 video_url = data['akamai_url'] + '&cbr=256'
3640 url_parts = compat_urllib_parse_urlparse(video_url)
3641 video_ext = url_parts.path.rpartition('.')[2]
# Optional fields use .get() so missing JSON keys degrade to None.
3646 'title': data['title'],
3647 'description': data.get('teaser_text'),
3648 'location': data.get('country_of_origin'),
3649 'uploader': data.get('host', {}).get('name'),
3650 'uploader_id': data.get('host', {}).get('slug'),
3651 'thumbnail': data.get('image', {}).get('large_url_2x'),
3652 'duration': data.get('duration'),
3657 class YouPornIE(InfoExtractor):
3658 """Information extractor for youporn.com."""
3659 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3661 def _print_formats(self, formats):
3662 """Print all available formats"""
3663 print(u'Available formats:')
3664 print(u'ext\t\tformat')
3665 print(u'---------------------------------')
3666 for format in formats:
3667 print(u'%s\t\t%s' % (format['ext'], format['format']))
3669 def _specific(self, req_format, formats):
# Linear scan for the single format entry matching req_format.
3671 if(x["format"]==req_format):
3675 def _real_extract(self, url):
# Scrapes title/date/uploader, then enumerates every entry of the
# page's download list as a selectable format.
# NOTE(review): sampled listing — `if result is None:` guards,
# loop headers and return statements are elided between the lines.
3676 mobj = re.match(self._VALID_URL, url)
3678 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3681 video_id = mobj.group('videoid')
# Age gate is bypassed with a pre-set cookie.
3683 req = compat_urllib_request.Request(url)
3684 req.add_header('Cookie', 'age_verified=1')
3685 webpage = self._download_webpage(req, video_id)
3687 # Get the video title
3688 result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3690 raise ExtractorError(u'ERROR: unable to extract video title')
3691 video_title = result.group('title').strip()
3693 # Get the video date
3694 result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
3696 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3699 upload_date = result.group('date').strip()
3701 # Get the video uploader
3702 result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3704 self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3705 video_uploader = None
3707 video_uploader = result.group('uploader').strip()
3708 video_uploader = clean_html( video_uploader )
3710 # Get all of the formats available
3711 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3712 result = re.search(DOWNLOAD_LIST_RE, webpage)
3714 raise ExtractorError(u'Unable to extract download list')
3715 download_list_html = result.group('download_list').strip()
3717 # Get all of the links from the page
3718 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3719 links = re.findall(LINK_RE, download_list_html)
3720 if(len(links) == 0):
3721 raise ExtractorError(u'ERROR: no known formats available for video')
3723 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3728 # A link looks like this:
3729 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3730 # A path looks like this:
3731 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format id ("480p-370k") is parsed out of the URL path's 5th segment.
3732 video_url = unescapeHTML( link )
3733 path = compat_urllib_parse_urlparse( video_url ).path
3734 extension = os.path.splitext( path )[1][1:]
3735 format = path.split('/')[4].split('_')[:2]
3738 format = "-".join( format )
3739 title = u'%s-%s-%s' % (video_title, size, bitrate)
3744 'uploader': video_uploader,
3745 'upload_date': upload_date,
3750 'description': None,
# Format selection: list-only mode, then best/worst/all/specific.
3754 if self._downloader.params.get('listformats', None):
3755 self._print_formats(formats)
3758 req_format = self._downloader.params.get('format', None)
3759 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3761 if req_format is None or req_format == 'best':
3763 elif req_format == 'worst':
3764 return [formats[-1]]
3765 elif req_format in ('-1', 'all'):
3768 format = self._specific( req_format, formats )
3770 self._downloader.trouble(u'ERROR: requested format not available')
3776 class PornotubeIE(InfoExtractor):
3777 """Information extractor for pornotube.com."""
3778 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3780 def _real_extract(self, url):
# Title comes straight from the URL; flv URL and added-date are
# scraped from the page.
# NOTE(review): sampled listing — `if mobj is None:` / `if result is
# None:` guard lines before the trouble() calls are elided.
3781 mobj = re.match(self._VALID_URL, url)
3783 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3786 video_id = mobj.group('videoid')
3787 video_title = mobj.group('title')
3789 # Get webpage content
3790 webpage = self._download_webpage(url, video_id)
3793 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3794 result = re.search(VIDEO_URL_RE, webpage)
3796 self._downloader.trouble(u'ERROR: unable to extract video url')
3798 video_url = compat_urllib_parse.unquote(result.group('url'))
3800 #Get the uploaded date
3801 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3802 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error text says "video title" but this search is for
# the upload date — message looks copy-pasted; confirm.
3804 self._downloader.trouble(u'ERROR: unable to extract video title')
3806 upload_date = result.group('date')
3808 info = {'id': video_id,
3811 'upload_date': upload_date,
3812 'title': video_title,
3818 class YouJizzIE(InfoExtractor):
3819 """Information extractor for youjizz.com."""
3820 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3822 def _real_extract(self, url):
# Two-page scrape: the watch page yields the title and an embed-page
# URL; the embed page yields the actual file URL from a JS call.
# NOTE(review): sampled listing — `if result is None:` guard lines
# before the raises are elided.
3823 mobj = re.match(self._VALID_URL, url)
3825 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3828 video_id = mobj.group('videoid')
3830 # Get webpage content
3831 webpage = self._download_webpage(url, video_id)
3833 # Get the video title
3834 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3836 raise ExtractorError(u'ERROR: unable to extract video title')
3837 video_title = result.group('title').strip()
3839 # Get the embed page
3840 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3842 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is replaced by the embed page's numeric id from here on.
3844 embed_page_url = result.group(0).strip()
3845 video_id = result.group('videoid')
3847 webpage = self._download_webpage(embed_page_url, video_id)
3850 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3852 raise ExtractorError(u'ERROR: unable to extract video url')
3853 video_url = result.group('source')
3855 info = {'id': video_id,
3857 'title': video_title,
# The SWF embed page doubles as the rtmpdump player URL.
3860 'player_url': embed_page_url}
3864 class EightTracksIE(InfoExtractor):
# Extractor for 8tracks.com mixes: reads the TRAX.Mix JSON from the
# page, then walks the play/next API song-by-song until at_last_track.
3866 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3868 def _real_extract(self, url):
# NOTE(review): sampled listing — guard lines, the mix_id assignment
# and the result-list accumulation are elided between visible lines.
3869 mobj = re.match(self._VALID_URL, url)
3871 raise ExtractorError(u'Invalid URL: %s' % url)
3872 playlist_id = mobj.group('id')
3874 webpage = self._download_webpage(url, playlist_id)
3876 m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
3878 raise ExtractorError(u'Cannot find trax information')
3879 json_like = m.group(1)
3880 data = json.loads(json_like)
# Random session token required by the play API.
3882 session = str(random.randint(0, 1000000000))
3884 track_count = data['tracks_count']
3886 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3886 next_url = first_url
# One API round-trip per track; the loop ends via the at_last_track
# flag below, not via the (unbounded) counter.
3888 for i in itertools.count():
3889 api_json = self._download_webpage(next_url, playlist_id,
3890 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3891 errnote=u'Failed to download song information')
3892 api_data = json.loads(api_json)
3893 track_data = api_data[u'set']['track']
3895 'id': track_data['id'],
3896 'url': track_data['track_file_stream_url'],
3897 'title': track_data['performer'] + u' - ' + track_data['name'],
3898 'raw_title': track_data['name'],
3899 'uploader_id': data['user']['login'],
3903 if api_data['set']['at_last_track']:
3905 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3908 def gen_extractors():
3909 """ Return a list of an instance of every supported extractor.
3910 The order does matter; the first extractor matched is the one handling the URL.
3913 YoutubePlaylistIE(),
3937 StanfordOpenClassroomIE(),