2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.photobucket import PhotobucketIE
31 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Two extraction paths exist: an MRSS/regex path keyed on the numeric id
    in the URL, and a YQL/JSON path used when the page embeds a long
    ``CONTENT_ID`` (see ``_real_extract``).
    """
    # Video pages look like http://screen.yahoo.com/<slug>-<numeric id>.html
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # Presence of a YUI CONTENT_ID selects the alternate JSON path below.
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Scrape title/description/date/thumbnail out of the MRSS response.
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalize the MM/DD/YYYY date to the YYYYMMDD youtube-dl convention.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')
        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            # YQL query against yahoo.media.video.streams, wrapped in a JSONP callback.
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            # rtmp-style play path (the downloader combines it with 'url').
            'play_path': video_path,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs, including player/pro/group/album forms.
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _verify_video_password(self, url, video_id, webpage):
        """POST the user-supplied --password (with the page's xsrft token) to
        unlock a password-protected video. Raises ExtractorError when no
        password was given."""
        password = self._downloader.params.get('password', None)
        if password is None:
            raise ExtractorError(u'This video is protected by a password, use the --password option')
        # The CSRF token is embedded in the page's inline JS.
        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
        data = compat_urllib_parse.urlencode({'password': password,
                                              'token': token})
        # I didn't manage to use the password with https
        if url.startswith('https'):
            pass_url = url.replace('https','http')
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url+'/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        pass_web = self._download_webpage(password_request, video_id,
                                          u'Verifying the password',
                                          u'Wrong password')

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Pro and direct-link URLs must be normalized to the canonical page.
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player bootstrap.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Distinguish the known failure modes before giving up.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

            if re.search('If so please provide the correct password.', webpage):
                self._verify_video_password(url, video_id, webpage)
                # Retry now that the password cookie is set.
                return self._real_extract(url)
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that is non-empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both the live-stream pages (``_LIVE_URL``) and the "Plus 7"
    catch-up pages; extraction is driven by ``grep_webpage``, which fetches a
    page and pulls named groups out of it with a regex.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body (bytes)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        *matchTuples* — a list of ``(group_index, key, error_message)``.
        Raises ExtractorError when the regex or any group fails to match."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Language is encoded in the URL path (…/fr/… or …/de/…).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): as in the original, the computed URL is not returned —
        # live streams end up not being downloaded.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain page -> videoref XML -> per-language XML and
        return an info dict for the HD stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no site-specific extractor matches: follows URL-shortener
    redirects, then probes the page for a handful of common video-embedding
    patterns (JW Player flashvars, Twitter cards, Open Graph).
    """

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener with only the handlers we need, in order.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline  | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # GData v2 API, 50 results per page, JSON-C output.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' may shrink below n once the API reports fewer total items.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    # The "next" pagination link; its absence means the last page was reached.
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            # 10 results per page; 'start' is the 0-based result offset.
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop once enough results were collected or no further pages exist.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            # 30 results per page; 'b' is the 0-based result offset.
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            # The endpoint returns JSON: 'm' holds pagination metadata,
            # 'results' a list of HTML snippets.
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when n results were gathered or the index says this is the last page.
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Videos returned per AJAX page by blip.tv (see loop below).
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric users_id needed by the AJAX endpoint is embedded in the page.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in before extraction if credentials were supplied.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' machine entry in ~/.netrc. Login failures only emit a
        warning: extraction of public videos can still proceed.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a player whose fragment carries the
            # real file id; resolve it and recurse on the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload may or may not be wrapped in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Two paths: a plain <source src=...> page, or an RC4-encrypted XML blob
    (the key is derived from a double-base64 constant and the video id).
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """Standard RC4: KSA over `key`, then PRGA XOR over `data` (bytes);
        returns a str."""
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        # Hex digest as bytes, ready to feed back into the key derivation.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Double-base64-encoded key material for the RC4 key derivation.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page carries a direct <source src=...>.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      video_ext,
            }]

        # Otherwise the stream info lives in an encrypted XML blob whose URL
        # is assembled from the page's flashvars.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # Fetch and decrypt: key = md5(b64decode(b64decode(GK)) + md5(video_id)).
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # rtmp play path: "<ext>:<path-without-ext>"
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): this excerpt does not show where video_filepath is
            # defined; restored from the decrypted data — TODO confirm upstream.
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'video path')
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
# NOTE(review): numbered listing with elided lines — the code below is NOT
# contiguous; several guard/else/try lines are missing from this view.
# Extractor for Daily Show / Colbert Report episodes and clips. It resolves
# shortname/redirect URLs, locates an mtvnservices media URI in the page,
# downloads an MRSS index, then a per-part mediaGen config listing renditions,
# and finally rewrites the RTMP URL into a plain HTTP one.
1055 class ComedyCentralIE(InfoExtractor):
1056 """Information extractor for The Daily Show and Colbert Report """
1058 # urls can be abbreviations like :thedailyshow or :colbert
1059 # urls for episodes like:
1060 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
1061 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
1062 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: named groups shortname/showname/episode/clipID/cntitle/date/tdstitle.
1063 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
1064 |(https?://)?(www\.)?
1065 (?P<showname>thedailyshow|colbertnation)\.com/
1066 (full-episodes/(?P<episode>.*)|
1068 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
1069 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrate identifiers, lowest-quality last is NOT the case: list is descending,
# but selection below takes turls[-1] (highest bitrate after sorting upstream).
1072 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
1074 _video_extensions = {
1082 _video_dimensions = {
# Override needed because _VALID_URL requires re.VERBOSE, which the default
# suitable() in the base class does not pass.
1092 def suitable(cls, url):
1093 """Receives a URL and returns True if suitable for this IE."""
1094 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1096 def _print_formats(self, formats):
1097 print('Available formats:')
1099 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
1102 def _real_extract(self, url):
1103 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1105 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (":tds", ":colbert") are rewritten to the newest full episode.
1107 if mobj.group('shortname'):
1108 if mobj.group('shortname') in ('tds', 'thedailyshow'):
1109 url = u'http://www.thedailyshow.com/full-episodes/'
1111 url = u'http://www.colbertnation.com/full-episodes/'
1112 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1113 assert mobj is not None
1115 if mobj.group('clip'):
1116 if mobj.group('showname') == 'thedailyshow':
1117 epTitle = mobj.group('tdstitle')
1119 epTitle = mobj.group('cntitle')
1122 dlNewest = not mobj.group('episode')
1124 epTitle = mobj.group('showname')
1126 epTitle = mobj.group('episode')
1128 self.report_extraction(epTitle)
# Follow redirects so "newest episode" URLs resolve to a concrete episode.
1129 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
1131 url = htmlHandle.geturl()
1132 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1134 raise ExtractorError(u'Invalid redirected URL: ' + url)
1135 if mobj.group('episode') == '':
1136 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
1137 epTitle = mobj.group('episode')
1139 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
1141 if len(mMovieParams) == 0:
1142 # The Colbert Report embeds the information in a without
1143 # a URL prefix; so extract the alternate reference
1144 # and then add the URL prefix manually.
1146 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
1147 if len(altMovieParams) == 0:
1148 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
1150 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
1152 uri = mMovieParams[0][1]
1153 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
1154 indexXml = self._download_webpage(indexUrl, epTitle,
1155 u'Downloading show index',
1156 u'unable to download episode index')
1160 idoc = xml.etree.ElementTree.fromstring(indexXml)
1161 itemEls = idoc.findall('.//item')
# One <item> per episode part; each part becomes its own info dict.
1162 for partNum,itemEl in enumerate(itemEls):
1163 mediaId = itemEl.findall('./guid')[0].text
1164 shortMediaId = mediaId.split(':')[-1]
1165 showId = mediaId.split(':')[-2].replace('.com', '')
1166 officialTitle = itemEl.findall('./title')[0].text
1167 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
1169 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
1170 compat_urllib_parse.urlencode({'uri': mediaId}))
1171 configXml = self._download_webpage(configUrl, epTitle,
1172 u'Downloading configuration for %s' % shortMediaId)
1174 cdoc = xml.etree.ElementTree.fromstring(configXml)
1176 for rendition in cdoc.findall('.//rendition'):
1177 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
1181 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
1184 if self._downloader.params.get('listformats', None):
1185 self._print_formats([i[0] for i in turls])
1188 # For now, just pick the highest bitrate
1189 format,rtmp_video_url = turls[-1]
1191 # Get the format arg from the arg stream
1192 req_format = self._downloader.params.get('format', None)
1194 # Select format if we can find one
1197 format, rtmp_video_url = f, v
# Transform the rtmpe:// CDN URL into a direct HTTP mp4 URL on llnwd.net.
1200 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
1202 raise ExtractorError(u'Cannot transform RTMP url')
1203 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
1204 video_url = base + m.group('finalid')
1206 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
1211 'upload_date': officialDate,
1216 'description': officialTitle,
1218 results.append(info)
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for escapistmagazine.com: scrapes meta tags for description,
# thumbnail, player URL and title, then fetches the player's config "JSON"
# (actually JavaScript with single quotes) to obtain the media URL.
1223 class EscapistIE(InfoExtractor):
1224 """Information extractor for The Escapist """
1226 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
1227 IE_NAME = u'escapist'
1229 def _real_extract(self, url):
1230 mobj = re.match(self._VALID_URL, url)
1232 raise ExtractorError(u'Invalid URL: %s' % url)
1233 showName = mobj.group('showname')
1234 videoId = mobj.group('episode')
1236 self.report_extraction(videoId)
1237 webpage = self._download_webpage(url, videoId)
1239 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
1240 webpage, u'description', fatal=False)
1242 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
1243 webpage, u'thumbnail', fatal=False)
1245 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
1246 webpage, u'player url')
1248 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
# NOTE(review): the search description below says u'player url' but this regex
# extracts the page title — looks like a copy-paste slip in the error label.
1249 webpage, u'player url').split(' : ')[-1]
# The config URL is URL-encoded inside the player URL's query string.
1251 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
1252 configUrl = compat_urllib_parse.unquote(configUrl)
1254 configJSON = self._download_webpage(configUrl, videoId,
1255 u'Downloading configuration',
1256 u'unable to download configuration')
1258 # Technically, it's JavaScript, not JSON
1259 configJSON = configJSON.replace("'", '"')
1262 config = json.loads(configJSON)
1263 except (ValueError,) as err:
1264 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
1266 playlist = config['playlist']
# Index 1, not 0 — presumably entry 0 is an ad or intro; TODO confirm.
1267 videoUrl = playlist[1]['url']
1272 'uploader': showName,
1273 'upload_date': None,
1276 'thumbnail': imgUrl,
1277 'description': videoDesc,
1278 'player_url': playerUrl,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for collegehumor.com: downloads a moogaloop metadata XML for the
# title/description/thumbnail and an f4m manifest to build the final segment URL.
1283 class CollegeHumorIE(InfoExtractor):
1284 """Information extractor for collegehumor.com"""
1287 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
1288 IE_NAME = u'collegehumor'
1290 def report_manifest(self, video_id):
1291 """Report information extraction."""
1292 self.to_screen(u'%s: Downloading XML manifest' % video_id)
1294 def _real_extract(self, url):
1295 mobj = re.match(self._VALID_URL, url)
1297 raise ExtractorError(u'Invalid URL: %s' % url)
1298 video_id = mobj.group('videoid')
1303 'upload_date': None,
1306 self.report_extraction(video_id)
1307 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
1309 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1311 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1313 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on missing elements; the elided handler
# converts that into the 'Invalid metadata XML file' error below.
1315 videoNode = mdoc.findall('./video')[0]
1316 info['description'] = videoNode.findall('./description')[0].text
1317 info['title'] = videoNode.findall('./caption')[0].text
1318 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
1319 manifest_url = videoNode.findall('./file')[0].text
1321 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required by the Adobe HDS (f4m) delivery endpoint.
1323 manifest_url += '?hdcore=2.10.3'
1324 self.report_manifest(video_id)
1326 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
1327 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1328 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1330 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m manifest uses the Adobe f4m XML namespace; media/@url + id build the path.
1332 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
1333 node_id = media_node.attrib['url']
1334 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
1335 except IndexError as err:
1336 raise ExtractorError(u'Invalid manifest file')
1338 url_pr = compat_urllib_parse_urlparse(manifest_url)
1339 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for xvideos.com: video URL, title and thumbnail are scraped
# directly from the watch page with regexes.
1346 class XVideosIE(InfoExtractor):
1347 """Information extractor for xvideos.com"""
1349 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
1350 IE_NAME = u'xvideos'
1352 def _real_extract(self, url):
1353 mobj = re.match(self._VALID_URL, url)
1355 raise ExtractorError(u'Invalid URL: %s' % url)
1356 video_id = mobj.group(1)
1358 webpage = self._download_webpage(url, video_id)
1360 self.report_extraction(video_id)
# flv_url is percent-encoded inside the page's flashvars.
1363 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
1364 webpage, u'video URL'))
1367 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
1370 # Extract video thumbnail
1371 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
1372 webpage, u'thumbnail', fatal=False)
1378 'upload_date': None,
1379 'title': video_title,
1381 'thumbnail': video_thumbnail,
1382 'description': None,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for a single soundcloud.com track: resolves the permalink via the
# public resolve.json API, then asks the streams endpoint for the MP3 URL.
1388 class SoundcloudIE(InfoExtractor):
1389 """Information extractor for soundcloud.com
1390 To access the media, the uid of the song and a stream token
1391 must be extracted from the page source and the script must make
1392 a request to media.soundcloud.com/crossdomain.xml. Then
1393 the media can be grabbed by requesting from an url composed
1394 of the stream token and uid
1397 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
1398 IE_NAME = u'soundcloud'
1400 def report_resolve(self, video_id):
1401 """Report information extraction."""
1402 self.to_screen(u'%s: Resolving id' % video_id)
1404 def _real_extract(self, url):
1405 mobj = re.match(self._VALID_URL, url)
1407 raise ExtractorError(u'Invalid URL: %s' % url)
1409 # extract uploader (which is in the url)
1410 uploader = mobj.group(1)
1411 # extract simple title (uploader + slug of song title)
1412 slug_title = mobj.group(2)
1413 simple_title = uploader + u'-' + slug_title
1414 full_title = '%s/%s' % (uploader, slug_title)
1416 self.report_resolve(full_title)
# client_id is a fixed public API key embedded in the extractor.
1418 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
1419 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1420 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
1422 info = json.loads(info_json)
1423 video_id = info['id']
1424 self.report_extraction(full_title)
1426 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1427 stream_json = self._download_webpage(streams_url, full_title,
1428 u'Downloading stream definitions',
1429 u'unable to download stream definitions')
1431 streams = json.loads(stream_json)
# 128 kbps MP3 is the only stream variant this extractor uses.
1432 mediaURL = streams['http_mp3_128_url']
1433 upload_date = unified_strdate(info['created_at'])
1438 'uploader': info['user']['username'],
1439 'upload_date': upload_date,
1440 'title': info['title'],
1442 'description': info['description'],
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for soundcloud.com sets (playlists). Same resolve + streams flow as
# SoundcloudIE, repeated once per track in the set.
1445 class SoundcloudSetIE(InfoExtractor):
1446 """Information extractor for soundcloud.com sets
1447 To access the media, the uid of the song and a stream token
1448 must be extracted from the page source and the script must make
1449 a request to media.soundcloud.com/crossdomain.xml. Then
1450 the media can be grabbed by requesting from an url composed
1451 of the stream token and uid
1454 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
1455 IE_NAME = u'soundcloud:set'
1457 def report_resolve(self, video_id):
1458 """Report information extraction."""
1459 self.to_screen(u'%s: Resolving id' % video_id)
1461 def _real_extract(self, url):
1462 mobj = re.match(self._VALID_URL, url)
1464 raise ExtractorError(u'Invalid URL: %s' % url)
1466 # extract uploader (which is in the url)
1467 uploader = mobj.group(1)
1468 # extract simple title (uploader + slug of song title)
1469 slug_title = mobj.group(2)
1470 simple_title = uploader + u'-' + slug_title
1471 full_title = '%s/sets/%s' % (uploader, slug_title)
1473 self.report_resolve(full_title)
1475 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
1476 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1477 info_json = self._download_webpage(resolv_url, full_title)
1480 info = json.loads(info_json)
# API-level errors arrive as an 'errors' list rather than an HTTP failure.
1481 if 'errors' in info:
1482 for err in info['errors']:
1483 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
1486 self.report_extraction(full_title)
1487 for track in info['tracks']:
1488 video_id = track['id']
1490 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1491 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
1493 self.report_extraction(video_id)
1494 streams = json.loads(stream_json)
1495 mediaURL = streams['http_mp3_128_url']
1500 'uploader': track['user']['username'],
1501 'upload_date': unified_strdate(track['created_at']),
1502 'title': track['title'],
1504 'description': track['description'],
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for infoq.com: the real media id is base64-encoded in a JS variable
# (jsclassref); decoding it yields the RTMP path.
1509 class InfoQIE(InfoExtractor):
1510 """Information extractor for infoq.com"""
1511 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
1513 def _real_extract(self, url):
1514 mobj = re.match(self._VALID_URL, url)
1516 raise ExtractorError(u'Invalid URL: %s' % url)
1518 webpage = self._download_webpage(url, video_id=url)
1519 self.report_extraction(url)
1522 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
1524 raise ExtractorError(u'Unable to extract video url')
# base64-decode then percent-decode to recover the media path.
1525 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
1526 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
1529 video_title = self._search_regex(r'contentTitle = "(.*?)";',
1532 # Extract description
1533 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
1534 webpage, u'description', fatal=False)
# The filename portion of the RTMP path doubles as id + extension.
1536 video_filename = video_url.split('/')[-1]
1537 video_id, extension = video_filename.split('.')
1543 'upload_date': None,
1544 'title': video_title,
1545 'ext': extension, # Extension is always(?) mp4, but seems to be flv
1547 'description': video_description,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for mixcloud.com (marked _WORKING = False). Fetches the cloudcast
# JSON, walks the 'audio_formats' dict and probes candidate URLs until one
# responds.
1552 class MixcloudIE(InfoExtractor):
1553 """Information extractor for www.mixcloud.com"""
1555 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1556 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1557 IE_NAME = u'mixcloud'
1559 def report_download_json(self, file_id):
1560 """Report JSON download."""
1561 self.to_screen(u'Downloading json')
1563 def get_urls(self, jsonData, fmt, bitrate='best'):
1564 """Get urls from 'audio_formats' section in json"""
1567 bitrate_list = jsonData[fmt]
1568 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
1569 bitrate = max(bitrate_list) # select highest
1571 url_list = jsonData[fmt][bitrate]
1572 except TypeError: # we have no bitrate info.
1573 url_list = jsonData[fmt]
1576 def check_urls(self, url_list):
1577 """Returns 1st active url from list"""
# Probe each candidate with a real request; first that opens wins.
1578 for url in url_list:
1580 compat_urllib_request.urlopen(url)
1582 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1587 def _print_formats(self, formats):
1588 print('Available formats:')
1589 for fmt in formats.keys():
1590 for b in formats[fmt]:
1592 ext = formats[fmt][b][0]
1593 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
1594 except TypeError: # we have no bitrate info
1595 ext = formats[fmt][0]
1596 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
1599 def _real_extract(self, url):
1600 mobj = re.match(self._VALID_URL, url)
1602 raise ExtractorError(u'Invalid URL: %s' % url)
1603 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a regex group is Python-2-only; this
# raises AttributeError on Python 3 str — consistent with _WORKING = False.
1604 uploader = mobj.group(1).decode('utf-8')
1605 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
1607 # construct API request
1608 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
1609 # retrieve .json file with links to files
1610 request = compat_urllib_request.Request(file_url)
1612 self.report_download_json(file_url)
1613 jsonData = compat_urllib_request.urlopen(request).read()
1614 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1615 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
1618 json_data = json.loads(jsonData)
1619 player_url = json_data['player_swf_url']
1620 formats = dict(json_data['audio_formats'])
1622 req_format = self._downloader.params.get('format', None)
1625 if self._downloader.params.get('listformats', None):
1626 self._print_formats(formats)
# No requested format: scan all formats for the first reachable URL.
1629 if req_format is None or req_format == 'best':
1630 for format_param in formats.keys():
1631 url_list = self.get_urls(formats, format_param)
1633 file_url = self.check_urls(url_list)
1634 if file_url is not None:
1637 if req_format not in formats:
1638 raise ExtractorError(u'Format is not available')
1640 url_list = self.get_urls(formats, req_format)
1641 file_url = self.check_urls(url_list)
1642 format_param = req_format
1645 'id': file_id.decode('utf-8'),
1646 'url': file_url.decode('utf-8'),
1647 'uploader': uploader.decode('utf-8'),
1648 'upload_date': None,
1649 'title': json_data['name'],
1650 'ext': file_url.split('.')[-1].decode('utf-8'),
1651 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1652 'thumbnail': json_data['thumbnail_url'],
1653 'description': json_data['description'],
1654 'player_url': player_url.decode('utf-8'),
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for Stanford Open Classroom. Three modes depending on URL groups:
# a single video (course+video), a course page (course only), or the root page;
# the latter two return lists of 'reference' entries that are re-extracted.
1657 class StanfordOpenClassroomIE(InfoExtractor):
1658 """Information extractor for Stanford's Open ClassRoom"""
1660 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
1661 IE_NAME = u'stanfordoc'
1663 def _real_extract(self, url):
1664 mobj = re.match(self._VALID_URL, url)
1666 raise ExtractorError(u'Invalid URL: %s' % url)
1668 if mobj.group('course') and mobj.group('video'): # A specific video
1669 course = mobj.group('course')
1670 video = mobj.group('video')
1672 'id': course + '_' + video,
1674 'upload_date': None,
1677 self.report_extraction(info['id'])
1678 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
1679 xmlUrl = baseUrl + video + '.xml'
1681 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1682 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1683 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1684 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1686 info['title'] = mdoc.findall('./title')[0].text
1687 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
1689 raise ExtractorError(u'Invalid metadata XML file')
1690 info['ext'] = info['url'].rpartition('.')[2]
1692 elif mobj.group('course'): # A course page
1693 course = mobj.group('course')
1698 'upload_date': None,
1701 coursepage = self._download_webpage(url, info['id'],
1702 note='Downloading course info page',
1703 errnote='Unable to download course info page')
1705 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
1707 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1708 coursepage, u'description', fatal=False)
# Collect VideoPage links; orderedSet dedupes while keeping page order.
1710 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1713 'type': 'reference',
1714 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursive extraction: each referenced video page goes back through extract().
1718 for entry in info['list']:
1719 assert entry['type'] == 'reference'
1720 results += self.extract(entry['url'])
1724 'id': 'Stanford OpenClassroom',
1727 'upload_date': None,
1730 self.report_download_webpage(info['id'])
1731 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1733 rootpage = compat_urllib_request.urlopen(rootURL).read()
1734 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1735 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1737 info['title'] = info['id']
1739 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1742 'type': 'reference',
1743 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1748 for entry in info['list']:
1749 assert entry['type'] == 'reference'
1750 results += self.extract(entry['url'])
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for mtv.com video pages: meta tags supply song/artist/uri, then the
# mediaGen XML lists renditions; the last (highest quality) rendition is used.
1753 class MTVIE(InfoExtractor):
1754 """Information extractor for MTV.com"""
1756 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
1759 def _real_extract(self, url):
1760 mobj = re.match(self._VALID_URL, url)
1762 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; normalize to http:// for the download.
1763 if not mobj.group('proto'):
1764 url = 'http://' + url
1765 video_id = mobj.group('videoid')
1767 webpage = self._download_webpage(url, video_id)
1769 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1770 webpage, u'song name', fatal=False)
1772 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1775 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1776 webpage, u'mtvn_uri', fatal=False)
1778 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1779 webpage, u'content id', fatal=False)
1781 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1782 self.report_extraction(video_id)
1783 request = compat_urllib_request.Request(videogen_url)
1785 metadataXml = compat_urllib_request.urlopen(request).read()
1786 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1787 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
1789 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
1790 renditions = mdoc.findall('.//rendition')
1792 # For now, always pick the highest quality.
1793 rendition = renditions[-1]
# type attr is a MIME type like "video/mp4"; partition keeps the subtype.
1796 _,_,ext = rendition.attrib['type'].partition('/')
1797 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
1798 video_url = rendition.find('./src').text
1800 raise ExtractorError('Invalid rendition field.')
1805 'uploader': performer,
1806 'upload_date': None,
1807 'title': video_title,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for v.youku.com. Youku obfuscates segment file ids with a seeded
# pseudo-random shuffle of a fixed alphabet; each segment URL also needs a
# per-segment key ('k') and a client-generated sid.
1815 class YoukuIE(InfoExtractor):
1816 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two random numbers, as one digit string.
1819 nowTime = int(time.time() * 1000)
1820 random1 = random.randint(1000,1998)
1821 random2 = random.randint(1000,9999)
1823 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the alphabet driven by the server-supplied seed;
# mirrors the LCG used by Youku's player (seed*211+30031 mod 65536).
1825 def _get_file_ID_mix_string(self, seed):
1827 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
1829 for i in range(len(source)):
1830 seed = (seed * 211 + 30031 ) % 65536
1831 index = math.floor(seed / 65536 * len(source) )
1832 mixed.append(source[int(index)])
1833 source.remove(source[int(index)])
1834 #return ''.join(mixed)
# Translate the '*'-separated obfuscated id through the shuffled alphabet.
1837 def _get_file_id(self, fileId, seed):
1838 mixed = self._get_file_ID_mix_string(seed)
1839 ids = fileId.split('*')
1843 realId.append(mixed[int(ch)])
1844 return ''.join(realId)
1846 def _real_extract(self, url):
1847 mobj = re.match(self._VALID_URL, url)
1849 raise ExtractorError(u'Invalid URL: %s' % url)
1850 video_id = mobj.group('ID')
1852 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
1854 jsondata = self._download_webpage(info_url, video_id)
1856 self.report_extraction(video_id)
1858 config = json.loads(jsondata)
1860 video_title = config['data'][0]['title']
1861 seed = config['data'][0]['seed']
1863 format = self._downloader.params.get('format', None)
1864 supported_format = list(config['data'][0]['streamfileids'].keys())
1866 if format is None or format == 'best':
1867 if 'hd2' in supported_format:
1872 elif format == 'worst':
1880 fileid = config['data'][0]['streamfileids'][format]
1881 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1882 except (UnicodeDecodeError, ValueError, KeyError):
1883 raise ExtractorError(u'Unable to extract info section')
1886 sid = self._gen_sid()
1887 fileid = self._get_file_id(fileid, seed)
1889 #column 8,9 of fileid represent the segment number
1890 #fileid[7:9] should be changed
# One download URL per segment; segment number is spliced in as hex at [8:10].
1891 for index, key in enumerate(keys):
1893 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1894 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1897 'id': '%s_part%02d' % (video_id, index),
1898 'url': download_url,
1900 'upload_date': None,
1901 'title': video_title,
1904 files_info.append(info)
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for video.xnxx.com: URL, title and thumbnail are scraped from the
# page with the class-level regex constants below.
1909 class XNXXIE(InfoExtractor):
1910 """Information extractor for xnxx.com"""
1912 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns (flashvars-style parameters and the <title> tag).
1914 VIDEO_URL_RE = r'flv_url=(.*?)&'
1915 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1916 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1918 def _real_extract(self, url):
1919 mobj = re.match(self._VALID_URL, url)
1921 raise ExtractorError(u'Invalid URL: %s' % url)
1922 video_id = mobj.group(1)
1924 # Get webpage content
1925 webpage = self._download_webpage(url, video_id)
1927 video_url = self._search_regex(self.VIDEO_URL_RE,
1928 webpage, u'video URL')
# flv_url is percent-encoded in the page.
1929 video_url = compat_urllib_parse.unquote(video_url)
1931 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1934 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1935 webpage, u'thumbnail', fatal=False)
1941 'upload_date': None,
1942 'title': video_title,
1944 'thumbnail': video_thumbnail,
1945 'description': None,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for Google+ video posts: scrapes the post page for date/uploader/
# title, follows the photo viewer page, then picks the highest-resolution
# redirector.googlevideo.com link.
1949 class GooglePlusIE(InfoExtractor):
1950 """Information extractor for plus.google.com."""
1952 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1953 IE_NAME = u'plus.google'
1955 def _real_extract(self, url):
1956 # Extract id from URL
1957 mobj = re.match(self._VALID_URL, url)
1959 raise ExtractorError(u'Invalid URL: %s' % url)
1961 post_url = mobj.group(0)
1962 video_id = mobj.group(1)
1964 video_extension = 'flv'
1966 # Step 1, Retrieve post webpage to extract further information
1967 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1969 self.report_extraction(video_id)
1971 # Extract update date
1972 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1973 webpage, u'upload date', fatal=False)
1975 # Convert timestring to a format suitable for filename
# Expects 'YYYY-MM-DD' in the page; reformatted to the YYYYMMDD convention.
1976 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1977 upload_date = upload_date.strftime('%Y%m%d')
1980 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1981 webpage, u'uploader', fatal=False)
1984 # Get the first line for title
1985 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1986 webpage, 'title', default=u'NA')
1988 # Step 2, Stimulate clicking the image box to launch video
1989 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1990 webpage, u'video page URL')
1991 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1993 # Extract video links on video page
1994 """Extract video links of all sizes"""
1995 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1996 mobj = re.findall(pattern, webpage)
1998 raise ExtractorError(u'Unable to extract video links')
2000 # Sort in resolution
2001 links = sorted(mobj)
2003 # Choose the lowest of the sort, i.e. highest resolution
2004 video_url = links[-1]
2005 # Only get the url. The resolution part in the tuple has no use anymore
2006 video_url = video_url[-1]
2007 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError branch re-encodes.
2009 video_url = video_url.decode("unicode_escape")
2010 except AttributeError: # Python 3
2011 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
2017 'uploader': uploader,
2018 'upload_date': upload_date,
2019 'title': video_title,
2020 'ext': video_extension,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for nba.com: the MP4 URL is built directly from the path id
# (no API call); title and description are scraped from meta tags.
2023 class NBAIE(InfoExtractor):
2024 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
2027 def _real_extract(self, url):
2028 mobj = re.match(self._VALID_URL, url)
2030 raise ExtractorError(u'Invalid URL: %s' % url)
2032 video_id = mobj.group(1)
2034 webpage = self._download_webpage(url, video_id)
# video_id starts with '/', so this concatenates into a valid CDN path.
2036 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
2038 shortened_video_id = video_id.rpartition('/')[2]
2039 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
2040 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
2042 # It isn't there in the HTML it returns to us
2043 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
2045 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
2048 'id': shortened_video_id,
2052 # 'uploader_date': uploader_date,
2053 'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a bare channel page, an archived broadcast (/b/<id>) or a chapter (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
    # The justin.tv API returns at most this many clips per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and convert each clip entry into an info dict."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API returns a dict carrying an 'error' message instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time begins with an ISO date; strip the dashes to get YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
                'title': video_title,
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': video_date,
                'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        # NOTE(review): several guard/else/return lines are not visible in this excerpt.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            # Whole channel: page through its broadcast archives.
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element whose id matches the chapter's archive id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we've reached the end of the archive list.
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player headline first; fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages (one page may hold many trailers)."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose (whitespace/comment) syntax, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): 'gameID' is not in the visible pattern above — a pattern line
        # appears truncated in this excerpt; verify against the full source.
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Age-gated page: re-request through the agecheck URL with a fixed birth date.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # Pair each movie URL with its title and thumbnail in page order.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Recorded videos are addressable on the CDN directly by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player config passes the media file to a Flash object via addVariable.
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # NOTE(review): the branch bodies after this test are not visible in this excerpt.
        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title' : video_title,
            'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment: gon.show={...};
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        # NOTE(review): the `try:` header for this parse is not visible in this excerpt.
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps rendition from Akamai.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the formats entry whose 'format' equals req_format.
        # NOTE(review): the enclosing loop and return lines are not visible in this excerpt.
        if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Bypass the age gate with a pre-set cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component like '480p_370k_...' -> keep resolution and bitrate.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are sorted best-first, so 'best' is the head and 'worst' the tail.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
                raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the player's JavaScript config.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The embed page's numeric id replaces the slug-style id from the watch URL.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (each mix is a playlist of tracks)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment: PAGE.mix = {...};
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires an arbitrary per-session id.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): `mix_id` is used below but its assignment is not visible here —
        # presumably taken from `data`; verify against the full source.
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        # Tracks are fetched one at a time; the API tells us when the mix ends.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Both the video and its thumbnail are addressed on the CDN directly by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose syntax, so re.VERBOSE is required to match it.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the assignment line for this verbose pattern (video_RE) is
        # not visible in this excerpt.
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        # Pair each talk id with its title/link match, in page order.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
            # Last entry in htmlStreams carries the stream actually used.
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # NOTE(review): the emptiness check preceding this fallback is not visible in this excerpt.
            _, video_id = os.path.split(url_parent_path)

        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = format_id_el.text
        # Description and preview image are optional in the metadata.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
            'thumbnail': thumbnail,
            'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
        # A per-video XML manifest lists the available renditions.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Take the last <type> entry of the manifest (presumably the best quality — verify).
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site prefix LiveLeak adds to every og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
            video_id = numid.group(1)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            # No streams and an "fsk" marker means age-restricted, evening-only content.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
            raise ExtractorError(u'No stream found.')

        # The selected URL points at a redirect page; fetch it to obtain the real link.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// link; fall back to rtsp:// if none is present.
        mobj = re.search(self._MMS_STREAM, media_link)
            mobj = re.search(self._RTSP_STREAM, media_link)
            raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
class TumblrIE(InfoExtractor):
    """Information extractor for videos hosted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is embedded in inline JS with \x22-escaped quotes.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp track downloads."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # NOTE: this local shadows the `id` builtin (pre-existing style in this file).
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 URL is exposed directly as an HTML5 <source> element.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            'ext': video_extension,
            'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # The per-video MRSS feed carries both the mp4 URL and the title.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            'ext': video_extension,
            'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Rebuild the canonical page URL from the numeric id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # Howcast's meta tags may use either double or single quotes around content.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
3097 class VineIE(InfoExtractor):
3098 """Information Extractor for Vine.co"""
3099 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
3101 def _real_extract(self, url):
3102 mobj = re.match(self._VALID_URL, url)
3104 video_id = mobj.group('id')
3105 webpage_url = 'https://vine.co/v/' + video_id
3106 webpage = self._download_webpage(webpage_url, video_id)
3108 self.report_extraction(video_id)
# The Twitter player-card meta tag carries the raw stream URL.
3110 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
3111 webpage, u'video URL')
3113 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The optional second group swallows any query string so only the clean image URL is captured.
3116 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
3117 webpage, u'thumbnail', fatal=False)
# Uploader name sits in an <h2> inside the "user" div; DOTALL lets .*? span newlines.
3119 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
3120 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3126 'title': video_title,
3127 'thumbnail': thumbnail,
3128 'uploader': uploader,
3131 class FlickrIE(InfoExtractor):
3132 """Information Extractor for Flickr videos"""
3133 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
3135 def _real_extract(self, url):
3136 mobj = re.match(self._VALID_URL, url)
3138 video_id = mobj.group('id')
3139 video_uploader_id = mobj.group('uploader_id')
# Rebuild a canonical photo-page URL from the captured uploader/id parts.
3140 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
3141 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo "secret" token is required by both XML endpoints below.
3143 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
# Two-step resolution: the first XML yields a node id, the second the stream info.
3145 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
3146 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
3148 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
3149 first_xml, u'node_id')
3151 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
3152 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
3154 self.report_extraction(video_id)
# Final URL = APP base + HTML-unescaped FULLPATH from the <STREAM> element.
3156 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
3158 raise ExtractorError(u'Unable to extract video url')
3159 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# og: meta tags; the (?:"..."|'...') alternation tolerates either quoting style.
3161 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
3162 webpage, u'video title')
3164 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
3165 webpage, u'description', fatal=False)
3167 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
3168 webpage, u'thumbnail', fatal=False)
3174 'title': video_title,
3175 'description': video_description,
3176 'thumbnail': thumbnail,
3177 'uploader_id': video_uploader_id,
3180 class TeamcocoIE(InfoExtractor):
# Information extractor for teamcoco.com. The URL only carries a slug, so the
# numeric video id is scraped from the page before hitting the data feed.
3181 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
3183 def _real_extract(self, url):
3184 mobj = re.match(self._VALID_URL, url)
3186 raise ExtractorError(u'Invalid URL: %s' % url)
3187 url_title = mobj.group('url_title')
3188 webpage = self._download_webpage(url, url_title)
# The slug page embeds the numeric id in the <article class="video"> tag.
3190 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
3191 webpage, u'video id')
3193 self.report_extraction(video_id)
3195 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
3198 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
3199 webpage, u'thumbnail', fatal=False)
3201 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
3202 webpage, u'description', fatal=False)
# The CVP XML feed keyed by the numeric id carries the actual media URLs.
3204 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
3205 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Take the "high"-quality <file> entry from the feed.
3207 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
3214 'title': video_title,
3215 'thumbnail': thumbnail,
3216 'description': video_description,
3219 class XHamsterIE(InfoExtractor):
3220 """Information Extractor for xHamster"""
3221 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
3223 def _real_extract(self,url):
3224 mobj = re.match(self._VALID_URL, url)
3226 video_id = mobj.group('id')
3227 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
3228 webpage = self._download_webpage(mrss_url, video_id)
# Player config: 'srv' is an optional server prefix, 'file' the media path.
3230 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
3232 raise ExtractorError(u'Unable to extract media URL')
3233 if len(mobj.group('server')) == 0:
# No server prefix: 'file' is already a (URL-encoded) full URL.
3234 video_url = compat_urllib_parse.unquote(mobj.group('file'))
3236 video_url = mobj.group('server')+'/key='+mobj.group('file')
# Extension is taken from the final URL component after the last dot.
3237 video_extension = video_url.split('.')[-1]
3239 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
3242 # Can't see the description anywhere in the UI
3243 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
3244 # webpage, u'description', fatal=False)
3245 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is parsed out of a tooltip hint attribute and re-joined as YYYYMMDD.
3247 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
3249 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
3251 video_upload_date = None
3252 self._downloader.report_warning(u'Unable to extract upload date')
# Falls back to u'anonymous' when no uploader link is present.
3254 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
3255 webpage, u'uploader id', default=u'anonymous')
3257 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
3258 webpage, u'thumbnail', fatal=False)
3263 'ext': video_extension,
3264 'title': video_title,
3265 # 'description': video_description,
3266 'upload_date': video_upload_date,
3267 'uploader_id': video_uploader_id,
3268 'thumbnail': video_thumbnail
3271 class HypemIE(InfoExtractor):
3272 """Information Extractor for hypem"""
3273 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
3275 def _real_extract(self, url):
3276 mobj = re.match(self._VALID_URL, url)
3278 raise ExtractorError(u'Invalid URL: %s' % url)
3279 track_id = mobj.group(1)
# The 'ax'/'ts' query parameters mimic the site's own AJAX request.
3281 data = { 'ax': 1, 'ts': time.time() }
3282 data_encoded = compat_urllib_parse.urlencode(data)
3283 complete_url = url + "?" + data_encoded
3284 request = compat_urllib_request.Request(complete_url)
# Keep the response handle: its session cookie is required by the /serve call below.
3285 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
3286 cookie = urlh.headers.get('Set-Cookie', '')
3288 self.report_extraction(track_id)
# Track metadata is a JSON blob embedded in a <script id="displayList-data"> tag.
3290 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
3291 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
3293 track_list = json.loads(html_tracks)
3294 track = track_list[u'tracks'][0]
3296 raise ExtractorError(u'Hypemachine contained invalid JSON.')
3299 track_id = track[u"id"]
3300 artist = track[u"artist"]
3301 title = track[u"song"]
# NOTE(review): 'key' has no visible assignment in this excerpt — presumably it is
# read from the same track dict in an elided line; confirm against the full source.
3303 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
3304 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
3305 request.add_header('cookie', cookie)
3306 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
3308 song_data = json.loads(song_data_json)
3310 raise ExtractorError(u'Hypemachine contained invalid JSON.')
3311 final_url = song_data[u"url"]
3321 class Vbox7IE(InfoExtractor):
3322 """Information Extractor for Vbox7"""
3323 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
3325 def _real_extract(self,url):
3326 mobj = re.match(self._VALID_URL, url)
3328 raise ExtractorError(u'Invalid URL: %s' % url)
3329 video_id = mobj.group(1)
# The play page is a JavaScript redirect; follow it manually by scraping
# window.location and appending it to the URL actually reached.
3331 redirect_page, urlh = self._download_webpage_handle(url, video_id)
3332 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
3333 redirect_url = urlh.geturl() + new_location
3334 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# The page <title> is "<video title>/<suffix>" — keep only the part before the slash.
3336 title = self._html_search_regex(r'<title>(.*)</title>',
3337 webpage, u'title').split('/')[0].strip()
# POSTing as3=1&vid=<id> to magare.do returns '&'-separated key=value pairs.
3340 info_url = "http://vbox7.com/play/magare.do"
3341 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
3342 info_request = compat_urllib_request.Request(info_url, data)
3343 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
3344 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
3345 if info_response is None:
3346 raise ExtractorError(u'Unable to extract the media url')
# Split the key=value pairs and keep just the values (media URL, thumbnail URL).
3347 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
3354 'thumbnail': thumbnail_url,
3357 class GametrailersIE(InfoExtractor):
3358 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
3360 def _real_extract(self, url):
3361 mobj = re.match(self._VALID_URL, url)
3363 raise ExtractorError(u'Invalid URL: %s' % url)
3364 video_id = mobj.group('id')
3365 video_type = mobj.group('type')
3366 webpage = self._download_webpage(url, video_id)
3367 if video_type == 'full-episodes':
3368 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
3370 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
3371 mgid = self._search_regex(mgid_re, webpage, u'mgid')
3372 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
3374 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
3375 video_id, u'Downloading video info')
3376 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
3377 video_id, u'Downloading video urls info')
3379 self.report_extraction(video_id)
3380 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
3381 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
3383 <url>(?P<thumb>.*?)</url>.*
3386 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
3388 raise ExtractorError(u'Unable to extract video info')
3389 video_title = m_info.group('title')
3390 video_description = m_info.group('description')
3391 video_thumb = m_info.group('thumb')
3393 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
3394 if m_urls is None or len(m_urls) == 0:
3395 raise ExtractError(u'Unable to extrat video url')
3396 # They are sorted from worst to best quality
3397 video_url = m_urls[-1].group('url')
3399 return {'url': video_url,
3401 'title': video_title,
3402 # Videos are actually flv not mp4
3404 'thumbnail': video_thumb,
3405 'description': video_description,
3408 def gen_extractors():
3409 """ Return a list of an instance of every supported extractor.
3410 The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): the bulk of the extractor list (and the docstring/list
# scaffolding) is elided in this excerpt; only a few entries are visible.
3413 YoutubePlaylistIE(),
3438 StanfordOpenClassroomIE(),
3448 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Resolve *ie_name* to its extractor class.

    Looks up the class named ``<ie_name>IE`` in this module's namespace
    (e.g. ``'Youtube'`` -> ``YoutubeIE``). Raises KeyError when no such
    extractor class is defined.
    """
    return globals()['%sIE' % ie_name]