2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Matches both legacy ``?current=<name>.flv`` URLs and newer
    ``..._<id>.<flv|mp4>`` URLs.  Extraction first tries the JSON blob
    embedded in the page's JavaScript, then falls back to scraping the
    HTML (``<link rel="video_src">`` and the ``<title>`` tag).
    """
    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably precedes this
        # raise in the full source — confirm against the complete file.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_extension = mobj.group('ext')
        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)
        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        # When the page embeds its media metadata as JSON, use it directly.
        # NOTE(review): the "if mobj is not None:" branch header and the
        # opening of the returned info dict are not visible in this view.
            info = json.loads(mobj.group('json'))
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                # creationDate is a Unix timestamp; normalized to YYYYMMDD.
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],
        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): "if mobj is None:" guard appears elided here.
            raise ExtractorError(u'Unable to extract title')
        # Python 2 byte-string handling: page text decoded explicitly.
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Two extraction paths: the legacy ``cosmos.bcst.yahoo.com`` REST/MRSS
    service (keyed on the numeric id from the URL), and — when the page
    declares a YUI ``CONTENT_ID`` — a YQL query against
    ``yahoo.media.video.streams``.
    """
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): "if mobj is None:" guard not visible in this view.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # The presence of CONTENT_ID selects the newer YQL-based method.
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
        # NOTE(review): the "if m_id is None:" branch header is elided; the
        # following block is the legacy REST/MRSS path.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Verbose regex over the MRSS response (closing quotes elided here).
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            # NOTE(review): "if m_info is None:" guard appears elided.
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalize the MRSS date (m/d/Y) to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')
            # NOTE(review): this raise is presumably guarded by
            # "if m_rest is None:" in the full source.
                raise ExtractorError(u'Unable to extract video url')
        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            # YQL query, URL-encoded; returns JSONP that we unwrap below.
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get plain JSON.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): assignment of `meta` (res[u'meta']?) is elided.
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it
            'play_path': video_path,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the per-page ``{config: ...}`` JSON for title, owner,
    thumbnail, request signature/timestamp, and the available codec/
    quality files, then builds a ``play_redirect`` download URL.
    Password-protected videos are handled via ``_verify_video_password``.
    """
    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _verify_video_password(self, url, video_id, webpage):
        # Submit the user-supplied --password with the page's xsrft token.
        password = self._downloader.params.get('password', None)
        # NOTE(review): "if password is None:" guard appears elided.
            raise ExtractorError(u'This video is protected by a password, use the --password option')
        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
        data = compat_urllib_parse.urlencode({'password': password,
        # I didn't manage to use the password with https
        if url.startswith('https'):
            pass_url = url.replace('https','http')
        password_request = compat_urllib_request.Request(pass_url+'/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        pass_web = self._download_webpage(password_request, video_id,
                                          u'Verifying the password',

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): "if mobj is None:" guard not visible here.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Pro/direct-link URLs are normalized to the canonical watch page.
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id
        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)
        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)
        # Extract the config JSON
        # NOTE(review): a "try:" header for the block below appears elided;
        # the following if-chain is the corresponding error diagnosis path.
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            if re.search('If so please provide the correct password.', webpage):
                self._verify_video_password(url, video_id, webpage)
                # Retry once after password verification succeeds.
                return self._real_extract(url)
                raise ExtractorError(u'Unable to extract info section')
        video_title = config["video"]["title"]
        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]
        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''
        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        # NOTE(review): "if mobj is not None:" header appears elided.
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']
        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        # Bucket each available codec by its best advertised quality.
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
        # Pick the first non-empty bucket in preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
            raise ExtractorError(u'No known codec found')
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles two page kinds: live streams (``index-N.html`` pages, via
    ``extractLiveStream``) and "Plus 7" catch-up videos (via
    ``extractPlus7Stream``).  Both drill through intermediate pages with
    the regex helper ``grep_webpage``.
    """
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    def fetch_webpage(self, url):
        # Raw page fetch; wraps network errors in ExtractorError.
        request = compat_urllib_request.Request(url)
        # NOTE(review): the "try:" header for this block appears elided.
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and collect named groups per matchTuples.

        matchTuples is a list of (group_index, key, error_message); each
        matched group is stored under its key in the returned info dict.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): "if mobj is None:" guard appears elided.
            raise ExtractorError(u'Invalid URL: %s' % url)
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        # Language code is a fixed path component of live-stream URLs.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
                (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Second hop: the JS file names the geo-restricted stream path,
        # the SWF player, and the rtmp URL.
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Language code sits one segment closer to the end for Plus 7 URLs.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
                (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Follow the videoref file to the language-specific <video> entry.
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
                (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Final hop: the per-video XML carries id, title, date, and HD url.
        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)
        # Live pages are recognized by their index-N.html suffix.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First resolves URL-shortener redirects with a HEAD request, then
    scrapes the page for common embedded-player patterns (JW Player
    flashvars, file/source params, Twitter cards, Open Graph video).
    """

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Force HTTP HEAD so we only fetch headers, not the body.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: HEAD requests carry no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      origin_req_host=req.get_origin_req_host(),

        # Minimal opener: only the handlers needed for the HEAD probe.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        # NOTE(review): a "if response is None:" guard appears elided.
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()
        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): "try:" header for the download appears elided.
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): each fallback below is presumably guarded by an
        # elided "if mobj is None:" in the full source.
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries.

    Pages through the GData v2 JSON-C API, 50 results per request,
    and returns a playlist of watch-page url_results.
    """
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): initialisation of video_ids / pagenum / limit is
        # not visible in this view.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # GData start-index is 1-based.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            # Cap the effective limit at what the API says actually exists.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries.

    Scrapes the HTML results of google.com/search?tbm=vid, 10 hits per
    page, until n results are collected or no "next" link remains.
    """
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): initialisation of the `res` playlist dict is not
        # visible in this view.
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each organic hit is an <h3 class="r"> anchor.
            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                res['entries'].append(e)

            # Stop when enough results were gathered or there is no next page.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Uses the JSON endpoint of video.search.yahoo.com, 30 hits per page,
    delegating each hit to the YahooIE via url_result.
    """
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): initialisation of `res` (and `m`, used below) is
        # not visible in this view.
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop on count reached or when the API reports the last page.
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their page, then pages through
    the mobile episode-list endpoint collecting video ids, and returns
    them as a playlist of BlipTV url_results.
    """
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): "if mobj is None:" guard appears elided here.
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): the loop header (while ...: / pagenum init) is
        # not visible in this view.
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Rebuilds the URL in the English locale, simulates pressing the
    'Free download' button, and scrapes the resulting fileshare URL.
    """
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): the "try:" header for this block appears elided.
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's own restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (CLI credentials or .netrc), then parses the
    SWF parameter JSON embedded in the video page for hd_src/sd_src.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optional login step; extraction proceeds anonymously on failure.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the "try:" header and the branch using the
            # netrc result are not visible in this view.
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): "try:" header for the login POST appears elided.
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): "if mobj is None:" guard appears elided.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The SWF variable table sits between these two literal markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): "if not m:" guard appears elided.
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer HD, fall back to SD.
        video_url = video_data.get('hd_src')
            video_url = video_data['sd_src']
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            'title': video_title,
            'duration': video_duration,
            'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Normalizes api.swf# and /play/ URLs to canonical pages, then asks
    the site for a JSON description (iTunes User-Agent); direct video
    responses are handled without the JSON step.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): "if mobj is None:" guard appears elided.
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id is in the fragment.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            # Re-enter with the canonical URL.
            return self._real_extract(url)

        # NOTE(review): selection of `cchar` ('?' vs '&') is elided.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves richer JSON to the iTunes client.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): "info = None" / "try:" lines appear elided here.
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                # Site timestamps are 12-hour m-d-y; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                # NOTE(review): "if umobj is None:" guard appears elided.
                    raise ValueError('Can not determine filename extension')
                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): this chunk is a partial extract — each line starts with its
# original file line number and many intermediate lines are missing (embedded
# numbering jumps, e.g. 974->980). Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for myvideo.de; decrypts an RC4/
# base64-obfuscated XML blob to find RTMP or HTTP video URLs.
# NOTE(review): __rc4crypt reads `x` before any visible initialization — the
# `x = 0` / `y = 0` setup lines appear to be among the missing lines; confirm
# against the upstream source before relying on this block.
960 class MyVideoIE(InfoExtractor):
961 """Information Extractor for myvideo.de."""
963 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
966 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
967 # Released into the Public Domain by Tristan Fischer on 2013-05-19
968 # https://github.com/rg3/youtube-dl/pull/842
969 def __rc4crypt(self,data, key):
971 box = list(range(256))
972 for i in list(range(256)):
973 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
974 box[i], box[x] = box[x], box[i]
980 y = (y + box[x]) % 256
981 box[x], box[y] = box[y], box[x]
982 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
986 return hashlib.md5(s).hexdigest().encode()
988 def _real_extract(self,url):
989 mobj = re.match(self._VALID_URL, url)
991 raise ExtractorError(u'invalid URL: %s' % url)
993 video_id = mobj.group(1)
996 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
997 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
998 b'TnpsbA0KTVRkbU1tSTRNdz09'
1002 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
1003 webpage = self._download_webpage(webpage_url, video_id)
# Fast path (visible below): a plain <source src=...> tag means a direct
# .flv URL with no decryption needed.
1005 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
1006 if mobj is not None:
1007 self.report_extraction(video_id)
1008 video_url = mobj.group(1) + '.flv'
1010 video_title = self._html_search_regex('<title>([^<]+)</title>',
1013 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
1019 'upload_date': None,
1020 'title': video_title,
# Slow path: pull flashvars, fetch the encrypted player XML, RC4-decrypt it,
# then look for an RTMP connectionurl or a non-RTMP path/source pair.
1025 mobj = re.search('var flashvars={(.+?)}', webpage)
1027 raise ExtractorError(u'Unable to extract video')
1032 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
1033 if not a == '_encxml':
1036 encxml = compat_urllib_parse.unquote(b)
1037 if not params.get('domain'):
1038 params['domain'] = 'www.myvideo.de'
1039 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
1040 if 'flash_playertype=MTV' in xmldata_url:
1041 self._downloader.report_warning(u'avoiding MTV player')
1043 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
1044 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
1048 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
1049 enc_data_b = binascii.unhexlify(enc_data)
1051 base64.b64decode(base64.b64decode(GK)) +
1053 str(video_id).encode('utf-8')
1056 dec_data = self.__rc4crypt(enc_data_b, sk)
1059 self.report_extraction(video_id)
1062 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
1064 video_url = compat_urllib_parse.unquote(mobj.group(1))
1065 if 'myvideo2flash' in video_url:
1066 self._downloader.report_warning(u'forcing RTMPT ...')
1067 video_url = video_url.replace('rtmpe://', 'rtmpt://')
1070 # extract non rtmp videos
1071 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
1073 raise ExtractorError(u'unable to extract url')
1074 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
1076 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
1077 video_file = compat_urllib_parse.unquote(video_file)
1079 if not video_file.endswith('f4m'):
1080 ppath, prefix = video_file.split('.')
1081 video_playpath = '%s:%s' % (prefix, ppath)
1082 video_hls_playlist = ''
1085 video_hls_playlist = (
1086 video_filepath + video_file
1087 ).replace('.f4m', '.m3u8')
1089 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
1090 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
1092 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
1098 'tc_url': video_url,
1100 'upload_date': None,
1101 'title': video_title,
1103 'play_path': video_playpath,
1104 'video_file': video_file,
1105 'video_hls_playlist': video_hls_playlist,
1106 'player_url': video_swfobj,
# NOTE(review): partial extract — lines carry their original file line numbers
# and many intermediate lines are missing (e.g. the bodies of the
# _video_extensions / _video_dimensions dicts). Not runnable as-is; code left
# byte-identical.
# Purpose (from visible code): extractor for The Daily Show / Colbert Report;
# resolves short names or show URLs to an MRSS episode index, then downloads a
# per-media-id mediaGen config and rewrites the RTMP URL to an HTTP mirror.
1110 class ComedyCentralIE(InfoExtractor):
1111 """Information extractor for The Daily Show and Colbert Report """
1113 # urls can be abbreviations like :thedailyshow or :colbert
1114 # urls for episodes like:
1115 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
1116 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
1117 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
1118 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
1119 |(https?://)?(www\.)?
1120 (?P<showname>thedailyshow|colbertnation)\.com/
1121 (full-episodes/(?P<episode>.*)|
1123 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
1124 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
1127 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
1129 _video_extensions = {
1137 _video_dimensions = {
# suitable() is overridden because _VALID_URL uses re.VERBOSE (the default
# matcher would not compile it with that flag).
1147 def suitable(cls, url):
1148 """Receives a URL and returns True if suitable for this IE."""
1149 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1151 def _print_formats(self, formats):
1152 print('Available formats:')
1154 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
1157 def _real_extract(self, url):
1158 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1160 raise ExtractorError(u'Invalid URL: %s' % url)
# Short names (":tds", ":colbert") are rewritten to the full-episodes page
# and re-matched against _VALID_URL.
1162 if mobj.group('shortname'):
1163 if mobj.group('shortname') in ('tds', 'thedailyshow'):
1164 url = u'http://www.thedailyshow.com/full-episodes/'
1166 url = u'http://www.colbertnation.com/full-episodes/'
1167 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1168 assert mobj is not None
1170 if mobj.group('clip'):
1171 if mobj.group('showname') == 'thedailyshow':
1172 epTitle = mobj.group('tdstitle')
1174 epTitle = mobj.group('cntitle')
1177 dlNewest = not mobj.group('episode')
1179 epTitle = mobj.group('showname')
1181 epTitle = mobj.group('episode')
1183 self.report_extraction(epTitle)
# The handle is kept to detect the redirect target URL below.
1184 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
1186 url = htmlHandle.geturl()
1187 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1189 raise ExtractorError(u'Invalid redirected URL: ' + url)
1190 if mobj.group('episode') == '':
1191 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
1192 epTitle = mobj.group('episode')
1194 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
1196 if len(mMovieParams) == 0:
1197 # The Colbert Report embeds the information in a without
1198 # a URL prefix; so extract the alternate reference
1199 # and then add the URL prefix manually.
1201 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
1202 if len(altMovieParams) == 0:
1203 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
1205 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
1207 uri = mMovieParams[0][1]
1208 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
1209 indexXml = self._download_webpage(indexUrl, epTitle,
1210 u'Downloading show index',
1211 u'unable to download episode index')
1215 idoc = xml.etree.ElementTree.fromstring(indexXml)
1216 itemEls = idoc.findall('.//item')
# One MRSS <item> per episode part; each part becomes its own info dict.
1217 for partNum,itemEl in enumerate(itemEls):
1218 mediaId = itemEl.findall('./guid')[0].text
1219 shortMediaId = mediaId.split(':')[-1]
1220 showId = mediaId.split(':')[-2].replace('.com', '')
1221 officialTitle = itemEl.findall('./title')[0].text
1222 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
1224 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
1225 compat_urllib_parse.urlencode({'uri': mediaId}))
1226 configXml = self._download_webpage(configUrl, epTitle,
1227 u'Downloading configuration for %s' % shortMediaId)
1229 cdoc = xml.etree.ElementTree.fromstring(configXml)
1231 for rendition in cdoc.findall('.//rendition'):
1232 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
1236 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
1239 if self._downloader.params.get('listformats', None):
1240 self._print_formats([i[0] for i in turls])
1243 # For now, just pick the highest bitrate
1244 format,rtmp_video_url = turls[-1]
1246 # Get the format arg from the arg stream
1247 req_format = self._downloader.params.get('format', None)
1249 # Select format if we can find one
1252 format, rtmp_video_url = f, v
# The RTMP URL is not downloaded directly; its media path is grafted onto a
# known HTTP (llnwd.net) base URL instead.
1255 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
1257 raise ExtractorError(u'Cannot transform RTMP url')
1258 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
1259 video_url = base + m.group('finalid')
1261 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
1266 'upload_date': officialDate,
1271 'description': officialTitle,
1273 results.append(info)
# NOTE(review): partial extract — original line numbers are embedded in each
# line and intermediate lines are missing. Not runnable as-is; code left
# byte-identical.
# Purpose (from visible code): extractor for escapistmagazine.com; reads
# og/meta tags for description/thumbnail/player URL, then fetches the player
# config (single-quoted JavaScript coerced into JSON) to get the stream URL.
# NOTE(review): the title lookup at embedded line 1303-1304 passes
# u'player url' as the field name (copy-paste from the og:video lookup above);
# it should presumably say u'title' — confirm against upstream before fixing.
1278 class EscapistIE(InfoExtractor):
1279 """Information extractor for The Escapist """
1281 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
1282 IE_NAME = u'escapist'
1284 def _real_extract(self, url):
1285 mobj = re.match(self._VALID_URL, url)
1287 raise ExtractorError(u'Invalid URL: %s' % url)
1288 showName = mobj.group('showname')
1289 videoId = mobj.group('episode')
1291 self.report_extraction(videoId)
1292 webpage = self._download_webpage(url, videoId)
1294 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
1295 webpage, u'description', fatal=False)
1297 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
1298 webpage, u'thumbnail', fatal=False)
1300 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
1301 webpage, u'player url')
1303 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
1304 webpage, u'player url').split(' : ')[-1]
1306 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
1307 configUrl = compat_urllib_parse.unquote(configUrl)
1309 configJSON = self._download_webpage(configUrl, videoId,
1310 u'Downloading configuration',
1311 u'unable to download configuration')
1313 # Technically, it's JavaScript, not JSON
1314 configJSON = configJSON.replace("'", '"')
1317 config = json.loads(configJSON)
1318 except (ValueError,) as err:
1319 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
1321 playlist = config['playlist']
# The stream URL is taken from the second playlist entry.
1322 videoUrl = playlist[1]['url']
1327 'uploader': showName,
1328 'upload_date': None,
1331 'thumbnail': imgUrl,
1332 'description': videoDesc,
1333 'player_url': playerUrl,
# NOTE(review): partial extract — original line numbers are embedded and
# intermediate lines missing (e.g. the `try:` lines before the urlopen calls).
# Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for collegehumor.com; fetches the
# moogaloop metadata XML, then the Adobe f4m manifest, and assembles a direct
# Seg1-Frag1 fragment URL from the manifest's media node.
1338 class CollegeHumorIE(InfoExtractor):
1339 """Information extractor for collegehumor.com"""
1342 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
1343 IE_NAME = u'collegehumor'
1345 def report_manifest(self, video_id):
1346 """Report information extraction."""
1347 self.to_screen(u'%s: Downloading XML manifest' % video_id)
1349 def _real_extract(self, url):
1350 mobj = re.match(self._VALID_URL, url)
1352 raise ExtractorError(u'Invalid URL: %s' % url)
1353 video_id = mobj.group('videoid')
1358 'upload_date': None,
1361 self.report_extraction(video_id)
1362 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
1364 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1365 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1366 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1368 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1370 videoNode = mdoc.findall('./video')[0]
1371 info['description'] = videoNode.findall('./description')[0].text
1372 info['title'] = videoNode.findall('./caption')[0].text
1373 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
1374 manifest_url = videoNode.findall('./file')[0].text
1376 raise ExtractorError(u'Invalid metadata XML file')
1378 manifest_url += '?hdcore=2.10.3'
1379 self.report_manifest(video_id)
1381 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
1382 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1383 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# f4m manifests are namespaced; the Adobe namespace is spelled out inline.
1385 adoc = xml.etree.ElementTree.fromstring(manifestXml)
1387 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
1388 node_id = media_node.attrib['url']
1389 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
1390 except IndexError as err:
1391 raise ExtractorError(u'Invalid manifest file')
1393 url_pr = compat_urllib_parse_urlparse(manifest_url)
1394 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing. Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for xvideos.com; pulls flv_url, the
# <title> text, and a thumbnail URL out of the watch page with regexes.
1401 class XVideosIE(InfoExtractor):
1402 """Information extractor for xvideos.com"""
1404 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
1405 IE_NAME = u'xvideos'
1407 def _real_extract(self, url):
1408 mobj = re.match(self._VALID_URL, url)
1410 raise ExtractorError(u'Invalid URL: %s' % url)
1411 video_id = mobj.group(1)
1413 webpage = self._download_webpage(url, video_id)
1415 self.report_extraction(video_id)
# flv_url is percent-encoded in the page, hence the unquote.
1418 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
1419 webpage, u'video URL'))
1422 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
1425 # Extract video thumbnail
1426 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
1427 webpage, u'thumbnail', fatal=False)
1433 'upload_date': None,
1434 'title': video_title,
1436 'thumbnail': video_thumbnail,
1437 'description': None,
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing. Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for single soundcloud.com tracks;
# resolves the track via api.soundcloud.com/resolve.json, then reads the
# http_mp3_128_url stream from the sndcdn streams endpoint.
# NOTE(review): the client_id is hard-coded twice here (and again in
# SoundcloudSetIE) — a shared constant would avoid drift; not changed because
# the block is incomplete.
1443 class SoundcloudIE(InfoExtractor):
1444 """Information extractor for soundcloud.com
1445 To access the media, the uid of the song and a stream token
1446 must be extracted from the page source and the script must make
1447 a request to media.soundcloud.com/crossdomain.xml. Then
1448 the media can be grabbed by requesting from an url composed
1449 of the stream token and uid
1452 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
1453 IE_NAME = u'soundcloud'
1455 def report_resolve(self, video_id):
1456 """Report information extraction."""
1457 self.to_screen(u'%s: Resolving id' % video_id)
1459 def _real_extract(self, url):
1460 mobj = re.match(self._VALID_URL, url)
1462 raise ExtractorError(u'Invalid URL: %s' % url)
1464 # extract uploader (which is in the url)
1465 uploader = mobj.group(1)
1466 # extract simple title (uploader + slug of song title)
1467 slug_title = mobj.group(2)
1468 simple_title = uploader + u'-' + slug_title
1469 full_title = '%s/%s' % (uploader, slug_title)
1471 self.report_resolve(full_title)
1473 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
1474 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1475 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
1477 info = json.loads(info_json)
1478 video_id = info['id']
1479 self.report_extraction(full_title)
1481 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1482 stream_json = self._download_webpage(streams_url, full_title,
1483 u'Downloading stream definitions',
1484 u'unable to download stream definitions')
1486 streams = json.loads(stream_json)
1487 mediaURL = streams['http_mp3_128_url']
1488 upload_date = unified_strdate(info['created_at'])
1493 'uploader': info['user']['username'],
1494 'upload_date': upload_date,
1495 'title': info['title'],
1497 'description': info['description'],
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing. Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for soundcloud.com sets (playlists);
# resolves the set, reports per-track errors from the API's 'errors' list,
# then fetches the http_mp3_128_url stream for every track in the set.
1500 class SoundcloudSetIE(InfoExtractor):
1501 """Information extractor for soundcloud.com sets
1502 To access the media, the uid of the song and a stream token
1503 must be extracted from the page source and the script must make
1504 a request to media.soundcloud.com/crossdomain.xml. Then
1505 the media can be grabbed by requesting from an url composed
1506 of the stream token and uid
1509 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
1510 IE_NAME = u'soundcloud:set'
1512 def report_resolve(self, video_id):
1513 """Report information extraction."""
1514 self.to_screen(u'%s: Resolving id' % video_id)
1516 def _real_extract(self, url):
1517 mobj = re.match(self._VALID_URL, url)
1519 raise ExtractorError(u'Invalid URL: %s' % url)
1521 # extract uploader (which is in the url)
1522 uploader = mobj.group(1)
1523 # extract simple title (uploader + slug of song title)
1524 slug_title = mobj.group(2)
1525 simple_title = uploader + u'-' + slug_title
1526 full_title = '%s/sets/%s' % (uploader, slug_title)
1528 self.report_resolve(full_title)
1530 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
1531 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1532 info_json = self._download_webpage(resolv_url, full_title)
1535 info = json.loads(info_json)
1536 if 'errors' in info:
1537 for err in info['errors']:
1538 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
1541 self.report_extraction(full_title)
# One stream lookup per track in the resolved set.
1542 for track in info['tracks']:
1543 video_id = track['id']
1545 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1546 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
1548 self.report_extraction(video_id)
1549 streams = json.loads(stream_json)
1550 mediaURL = streams['http_mp3_128_url']
1555 'uploader': track['user']['username'],
1556 'upload_date': unified_strdate(track['created_at']),
1557 'title': track['title'],
1559 'description': track['description'],
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing. Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for infoq.com; base64-decodes the
# page's jsclassref value into a media path and builds an rtmpe URL from it.
1564 class InfoQIE(InfoExtractor):
1565 """Information extractor for infoq.com"""
1566 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
1568 def _real_extract(self, url):
1569 mobj = re.match(self._VALID_URL, url)
1571 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id is available up front, so the URL itself doubles as video_id.
1573 webpage = self._download_webpage(url, video_id=url)
1574 self.report_extraction(url)
1577 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
1579 raise ExtractorError(u'Unable to extract video url')
1580 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
1581 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
1584 video_title = self._search_regex(r'contentTitle = "(.*?)";',
1587 # Extract description
1588 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
1589 webpage, u'description', fatal=False)
1591 video_filename = video_url.split('/')[-1]
1592 video_id, extension = video_filename.split('.')
1598 'upload_date': None,
1599 'title': video_title,
1600 'ext': extension, # Extension is always(?) mp4, but seems to be flv
1602 'description': video_description,
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing (e.g. the `try:` lines implied by the bare `except` clauses).
# Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for mixcloud.com (marked _WORKING =
# False); fetches the cloudcast JSON from the old /api/1/ endpoint, picks a
# format/bitrate from 'audio_formats', and probes candidate URLs for the
# first live one.
1607 class MixcloudIE(InfoExtractor):
1608 """Information extractor for www.mixcloud.com"""
1610 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1611 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1612 IE_NAME = u'mixcloud'
1614 def report_download_json(self, file_id):
1615 """Report JSON download."""
1616 self.to_screen(u'Downloading json')
1618 def get_urls(self, jsonData, fmt, bitrate='best'):
1619 """Get urls from 'audio_formats' section in json"""
1622 bitrate_list = jsonData[fmt]
1623 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
1624 bitrate = max(bitrate_list) # select highest
1626 url_list = jsonData[fmt][bitrate]
1627 except TypeError: # we have no bitrate info.
1628 url_list = jsonData[fmt]
1631 def check_urls(self, url_list):
1632 """Returns 1st active url from list"""
# Probes each candidate by actually opening it; network errors skip to the
# next candidate.
1633 for url in url_list:
1635 compat_urllib_request.urlopen(url)
1637 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1642 def _print_formats(self, formats):
1643 print('Available formats:')
1644 for fmt in formats.keys():
1645 for b in formats[fmt]:
1647 ext = formats[fmt][b][0]
1648 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
1649 except TypeError: # we have no bitrate info
1650 ext = formats[fmt][0]
1651 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
1654 def _real_extract(self, url):
1655 mobj = re.match(self._VALID_URL, url)
1657 raise ExtractorError(u'Invalid URL: %s' % url)
1658 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups is Python-2-only behavior
# here — presumably this block predates the py3 port; confirm before reuse.
1659 uploader = mobj.group(1).decode('utf-8')
1660 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
1662 # construct API request
1663 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
1664 # retrieve .json file with links to files
1665 request = compat_urllib_request.Request(file_url)
1667 self.report_download_json(file_url)
1668 jsonData = compat_urllib_request.urlopen(request).read()
1669 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1670 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
1673 json_data = json.loads(jsonData)
1674 player_url = json_data['player_swf_url']
1675 formats = dict(json_data['audio_formats'])
1677 req_format = self._downloader.params.get('format', None)
1680 if self._downloader.params.get('listformats', None):
1681 self._print_formats(formats)
1684 if req_format is None or req_format == 'best':
1685 for format_param in formats.keys():
1686 url_list = self.get_urls(formats, format_param)
1688 file_url = self.check_urls(url_list)
1689 if file_url is not None:
1692 if req_format not in formats:
1693 raise ExtractorError(u'Format is not available')
1695 url_list = self.get_urls(formats, req_format)
1696 file_url = self.check_urls(url_list)
1697 format_param = req_format
1700 'id': file_id.decode('utf-8'),
1701 'url': file_url.decode('utf-8'),
1702 'uploader': uploader.decode('utf-8'),
1703 'upload_date': None,
1704 'title': json_data['name'],
1705 'ext': file_url.split('.')[-1].decode('utf-8'),
1706 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1707 'thumbnail': json_data['thumbnail_url'],
1708 'description': json_data['description'],
1709 'player_url': player_url.decode('utf-8'),
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing. Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for openclassroom.stanford.edu with
# three URL shapes — a specific video (course+video), a course page (links to
# VideoPage entries), and the root (links to CoursePage entries); the latter
# two recurse via self.extract() on 'reference' entries.
1712 class StanfordOpenClassroomIE(InfoExtractor):
1713 """Information extractor for Stanford's Open ClassRoom"""
1715 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
1716 IE_NAME = u'stanfordoc'
1718 def _real_extract(self, url):
1719 mobj = re.match(self._VALID_URL, url)
1721 raise ExtractorError(u'Invalid URL: %s' % url)
1723 if mobj.group('course') and mobj.group('video'): # A specific video
1724 course = mobj.group('course')
1725 video = mobj.group('video')
1727 'id': course + '_' + video,
1729 'upload_date': None,
1732 self.report_extraction(info['id'])
1733 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
1734 xmlUrl = baseUrl + video + '.xml'
1736 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1737 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1738 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1739 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1741 info['title'] = mdoc.findall('./title')[0].text
1742 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
1744 raise ExtractorError(u'Invalid metadata XML file')
1745 info['ext'] = info['url'].rpartition('.')[2]
1747 elif mobj.group('course'): # A course page
1748 course = mobj.group('course')
1753 'upload_date': None,
1756 coursepage = self._download_webpage(url, info['id'],
1757 note='Downloading course info page',
1758 errnote='Unable to download course info page')
1760 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
1762 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1763 coursepage, u'description', fatal=False)
1765 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1768 'type': 'reference',
1769 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
1773 for entry in info['list']:
1774 assert entry['type'] == 'reference'
1775 results += self.extract(entry['url'])
# Root page branch: enumerate every course and recurse into each.
1779 'id': 'Stanford OpenClassroom',
1782 'upload_date': None,
1785 self.report_download_webpage(info['id'])
1786 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1788 rootpage = compat_urllib_request.urlopen(rootURL).read()
1789 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1790 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1792 info['title'] = info['id']
1794 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1797 'type': 'reference',
1798 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1803 for entry in info['list']:
1804 assert entry['type'] == 'reference'
1805 results += self.extract(entry['url'])
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing (e.g. the assignment of `performer` used at the end). Not
# runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for mtv.com video pages; reads
# mtv_vt / mtv_an / mtvn_uri meta tags plus a playlist id, then downloads the
# mediaGen XML and picks the last (highest-quality) rendition.
1808 class MTVIE(InfoExtractor):
1809 """Information extractor for MTV.com"""
1811 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
1814 def _real_extract(self, url):
1815 mobj = re.match(self._VALID_URL, url)
1817 raise ExtractorError(u'Invalid URL: %s' % url)
1818 if not mobj.group('proto'):
1819 url = 'http://' + url
1820 video_id = mobj.group('videoid')
1822 webpage = self._download_webpage(url, video_id)
1824 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1825 webpage, u'song name', fatal=False)
1827 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1830 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1831 webpage, u'mtvn_uri', fatal=False)
1833 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1834 webpage, u'content id', fatal=False)
1836 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1837 self.report_extraction(video_id)
1838 request = compat_urllib_request.Request(videogen_url)
1840 metadataXml = compat_urllib_request.urlopen(request).read()
1841 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1842 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
1844 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
1845 renditions = mdoc.findall('.//rendition')
1847 # For now, always pick the highest quality.
1848 rendition = renditions[-1]
1851 _,_,ext = rendition.attrib['type'].partition('/')
1852 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
1853 video_url = rendition.find('./src').text
1855 raise ExtractorError('Invalid rendition field.')
1860 'uploader': performer,
1861 'upload_date': None,
1862 'title': video_title,
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing (e.g. the `def _gen_sid` header above line 1874, `mixed = []`,
# and the format-selection branches). Not runnable as-is; code byte-identical.
# Purpose (from visible code): extractor for v.youku.com; derives a session id
# and de-scrambles the segment file id with a seeded keystream, then builds
# one getFlvPath download URL per segment key.
1870 class YoukuIE(InfoExtractor):
1871 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id = millisecond timestamp + two bounded random ints.
1874 nowTime = int(time.time() * 1000)
1875 random1 = random.randint(1000,1998)
1876 random2 = random.randint(1000,9999)
1878 return "%d%d%d" %(nowTime,random1,random2)
1880 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet driven by `seed` (linear congruential
# step mod 65536); consumed by _get_file_id below.
1882 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
1884 for i in range(len(source)):
1885 seed = (seed * 211 + 30031 ) % 65536
1886 index = math.floor(seed / 65536 * len(source) )
1887 mixed.append(source[int(index)])
1888 source.remove(source[int(index)])
1889 #return ''.join(mixed)
1892 def _get_file_id(self, fileId, seed):
1893 mixed = self._get_file_ID_mix_string(seed)
1894 ids = fileId.split('*')
1898 realId.append(mixed[int(ch)])
1899 return ''.join(realId)
1901 def _real_extract(self, url):
1902 mobj = re.match(self._VALID_URL, url)
1904 raise ExtractorError(u'Invalid URL: %s' % url)
1905 video_id = mobj.group('ID')
1907 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
1909 jsondata = self._download_webpage(info_url, video_id)
1911 self.report_extraction(video_id)
1913 config = json.loads(jsondata)
1915 video_title = config['data'][0]['title']
1916 seed = config['data'][0]['seed']
1918 format = self._downloader.params.get('format', None)
1919 supported_format = list(config['data'][0]['streamfileids'].keys())
1921 if format is None or format == 'best':
1922 if 'hd2' in supported_format:
1927 elif format == 'worst':
1935 fileid = config['data'][0]['streamfileids'][format]
1936 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1937 except (UnicodeDecodeError, ValueError, KeyError):
1938 raise ExtractorError(u'Unable to extract info section')
1941 sid = self._gen_sid()
1942 fileid = self._get_file_id(fileid, seed)
1944 #column 8,9 of fileid represent the segment number
1945 #fileid[7:9] should be changed
1946 for index, key in enumerate(keys):
1948 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1949 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1952 'id': '%s_part%02d' % (video_id, index),
1953 'url': download_url,
1955 'upload_date': None,
1956 'title': video_title,
1959 files_info.append(info)
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing. Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for video.xnxx.com; class-level regex
# constants pull flv_url, the <title> text, and url_bigthumb from the page.
1964 class XNXXIE(InfoExtractor):
1965 """Information extractor for xnxx.com"""
1967 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
1969 VIDEO_URL_RE = r'flv_url=(.*?)&'
1970 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1971 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1973 def _real_extract(self, url):
1974 mobj = re.match(self._VALID_URL, url)
1976 raise ExtractorError(u'Invalid URL: %s' % url)
1977 video_id = mobj.group(1)
1979 # Get webpage content
1980 webpage = self._download_webpage(url, video_id)
1982 video_url = self._search_regex(self.VIDEO_URL_RE,
1983 webpage, u'video URL')
# flv_url is percent-encoded in the page.
1984 video_url = compat_urllib_parse.unquote(video_url)
1986 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1989 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1990 webpage, u'thumbnail', fatal=False)
1996 'upload_date': None,
1997 'title': video_title,
1999 'thumbnail': video_thumbnail,
2000 'description': None,
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing (e.g. the `try:` before the unicode_escape decode). Not
# runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for plus.google.com posts; scrapes
# the post page for date/uploader/title, follows the photos page link, and
# picks the highest-resolution redirector.googlevideo.com link.
2004 class GooglePlusIE(InfoExtractor):
2005 """Information extractor for plus.google.com."""
2007 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
2008 IE_NAME = u'plus.google'
2010 def _real_extract(self, url):
2011 # Extract id from URL
2012 mobj = re.match(self._VALID_URL, url)
2014 raise ExtractorError(u'Invalid URL: %s' % url)
2016 post_url = mobj.group(0)
2017 video_id = mobj.group(1)
2019 video_extension = 'flv'
2021 # Step 1, Retrieve post webpage to extract further information
2022 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
2024 self.report_extraction(video_id)
2026 # Extract update date
2027 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
2028 webpage, u'upload date', fatal=False)
2030 # Convert timestring to a format suitable for filename
2031 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
2032 upload_date = upload_date.strftime('%Y%m%d')
2035 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
2036 webpage, u'uploader', fatal=False)
2039 # Get the first line for title
2040 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
2041 webpage, 'title', default=u'NA')
2043 # Step 2, Stimulate clicking the image box to launch video
2044 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
2045 webpage, u'video page URL')
2046 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
2048 # Extract video links on video page
2049 """Extract video links of all sizes"""
2050 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
2051 mobj = re.findall(pattern, webpage)
2053 raise ExtractorError(u'Unable to extract video links')
2055 # Sort in resolution
2056 links = sorted(mobj)
2058 # Choose the lowest of the sort, i.e. highest resolution
2059 video_url = links[-1]
2060 # Only get the url. The resolution part in the tuple has no use anymore
2061 video_url = video_url[-1]
2062 # Treat escaped \u0026 style hex
# py2/py3 split: str.decode exists only on py2; py3 path round-trips via bytes.
2064 video_url = video_url.decode("unicode_escape")
2065 except AttributeError: # Python 3
2066 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
2072 'uploader': uploader,
2073 'upload_date': upload_date,
2074 'title': video_title,
2075 'ext': video_extension,
# NOTE(review): partial extract — original line numbers embedded, intermediate
# lines missing. Not runnable as-is; code left byte-identical.
# Purpose (from visible code): extractor for nba.com video pages; the MP4 URL
# is constructed directly from the path-derived video id on the Turner CDN,
# with title/description scraped from og:title and the description meta tag.
2078 class NBAIE(InfoExtractor):
2079 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
2082 def _real_extract(self, url):
2083 mobj = re.match(self._VALID_URL, url)
2085 raise ExtractorError(u'Invalid URL: %s' % url)
2087 video_id = mobj.group(1)
2089 webpage = self._download_webpage(url, video_id)
2091 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
2093 shortened_video_id = video_id.rpartition('/')[2]
2094 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
2095 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
2097 # It isn't there in the HTML it returns to us
2098 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
2100 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
2103 'id': shortened_video_id,
2107 # 'uploader_date': uploader_date,
2108 'description': description,
# Information extractor for justin.tv / twitch.tv: channels (paged archive
# API), single broadcasts (/b/), and chapters (/c/, resolved via both the
# justin.tv XML API and the twitch.tv kraken JSON API).
# NOTE(review): numbered excerpt with elided lines (guards, dict braces,
# loop setup around original lines 2230-2243); kept byte-identical.
2112 class JustinTVIE(InfoExtractor):
2113 """Information extractor for justin.tv and twitch.tv"""
2114 # TODO: One broadcast may be split into multiple videos. The key
2115 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
2116 # starts at 1 and increases. Can we treat all parts as one video?
2118 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
2120 (?P<channelid>[^/]+)|
2121 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
2122 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size for the channel-archive API below.
2126 _JUSTIN_PAGE_LIMIT = 100
2127 IE_NAME = u'justin.tv'
2129 def report_download_page(self, channel, offset):
2130 """Report attempt to download a single page of videos."""
2131 self.to_screen(u'%s: Downloading video information from %d to %d' %
2132 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
2134 # Return count of items, list of *valid* items
2135 def _parse_page(self, url, video_id):
2136 webpage = self._download_webpage(url, video_id,
2137 u'Downloading video info JSON',
2138 u'unable to download video info JSON')
2140 response = json.loads(webpage)
# A non-list response is an API error object carrying an 'error' key.
2141 if type(response) != list:
2142 error_text = response.get('error', 'unknown error')
2143 raise ExtractorError(u'Justin.tv API: %s' % error_text)
2145 for clip in response:
2146 video_url = clip['video_file_url']
2148 video_extension = os.path.splitext(video_url)[1][1:]
# start_time looks like an ISO date; strip dashes to get YYYYMMDD.
2149 video_date = re.sub('-', '', clip['start_time'][:10])
2150 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
2151 video_id = clip['id']
2152 video_title = clip.get('title', video_id)
2156 'title': video_title,
2157 'uploader': clip.get('channel_name', video_uploader_id),
2158 'uploader_id': video_uploader_id,
2159 'upload_date': video_date,
2160 'ext': video_extension,
2162 return (len(response), info)
2164 def _real_extract(self, url):
2165 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
2167 raise ExtractorError(u'invalid URL: %s' % url)
2169 api_base = 'http://api.justin.tv'
2171 if mobj.group('channelid'):
2173 video_id = mobj.group('channelid')
2174 api = api_base + '/channel/archives/%s.json' % video_id
2175 elif mobj.group('chapterid'):
2176 chapter_id = mobj.group('chapterid')
# The numeric archive id is embedded in the chapter page's javascript.
2178 webpage = self._download_webpage(url, chapter_id)
2179 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
2181 raise ExtractorError(u'Cannot find archive of a chapter')
2182 archive_id = m.group(1)
2184 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
2185 chapter_info_xml = self._download_webpage(api, chapter_id,
2186 note=u'Downloading chapter information',
2187 errnote=u'Chapter information download failed')
2188 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Find the <archive> element whose <id> matches the archive id above;
# after the loop, 'a' is the matching element (break/else elided).
2189 for a in doc.findall('.//archive'):
2190 if archive_id == a.find('./id').text:
2193 raise ExtractorError(u'Could not find chapter in chapter information')
2195 video_url = a.find('./video_file_url').text
2196 video_ext = video_url.rpartition('.')[2] or u'flv'
# Title/thumbnail/uploader come from the twitch kraken API instead.
2198 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
2199 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
2200 note='Downloading chapter metadata',
2201 errnote='Download of chapter metadata failed')
2202 chapter_info = json.loads(chapter_info_json)
2204 bracket_start = int(doc.find('.//bracket_start').text)
2205 bracket_end = int(doc.find('.//bracket_end').text)
2207 # TODO determine start (and probably fix up file)
2208 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
2209 #video_url += u'?start=' + TODO:start_timestamp
2210 # bracket_start is 13290, but we want 51670615
2211 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
2212 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
2215 'id': u'c' + chapter_id,
2218 'title': chapter_info['title'],
2219 'thumbnail': chapter_info['preview'],
2220 'description': chapter_info['description'],
2221 'uploader': chapter_info['channel']['display_name'],
2222 'uploader_id': chapter_info['channel']['name'],
# (else branch) single broadcast: /b/<videoid>
2226 video_id = mobj.group('videoid')
2227 api = api_base + '/broadcast/by_archive/%s.json' % video_id
2229 self.report_extraction(video_id)
# Page through the API until a short page signals the end (loop
# header and offset bookkeeping elided in this excerpt).
2233 limit = self._JUSTIN_PAGE_LIMIT
2236 self.report_download_page(video_id, offset)
2237 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
2238 page_count, page_info = self._parse_page(page_url, video_id)
2239 info.extend(page_info)
2240 if not paged or page_count != limit:
# Information extractor for funnyordie.com: scrapes the <source> element for
# the media URL, with a two-pattern fallback for the title.
# NOTE(review): numbered excerpt — the "if mobj is None:" guard and the
# returned info dict's remaining keys are elided; kept byte-identical.
2245 class FunnyOrDieIE(InfoExtractor):
2246 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
2248 def _real_extract(self, url):
2249 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
2251 raise ExtractorError(u'invalid URL: %s' % url)
2253 video_id = mobj.group('id')
2254 webpage = self._download_webpage(url, video_id)
2256 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
2257 webpage, u'video URL', flags=re.DOTALL)
# Tuple of patterns: prefer the player heading, fall back to <title>.
2259 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
2260 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
2262 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2263 webpage, u'description', fatal=False, flags=re.DOTALL)
2270 'description': video_description,
# Information extractor for store.steampowered.com game trailer pages:
# handles the age gate, then zips three regex scans (movie URLs, titles,
# thumbnails) into a playlist result.
# NOTE(review): numbered excerpt — the 'gameID' group referenced at 2291 is
# in an elided part of _VALID_URL, and the per-video dict braces/append are
# elided; kept byte-identical.
2274 class SteamIE(InfoExtractor):
2275 _VALID_URL = r"""http://store\.steampowered\.com/
2277 (?P<urltype>video|app)/ #If the page is only for videos or for a game
2279 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
2281 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Pre-filled birth date (1 Jan 1970) to pass the store's age check.
2282 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# suitable() is overridden because _VALID_URL needs re.VERBOSE.
2285 def suitable(cls, url):
2286 """Receives a URL and returns True if suitable for this IE."""
2287 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2289 def _real_extract(self, url):
2290 m = re.match(self._VALID_URL, url, re.VERBOSE)
2291 gameID = m.group('gameID')
2293 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
2294 webpage = self._download_webpage(videourl, gameID)
# Age-gated pages are re-fetched through the agecheck URL.
2296 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
2297 videourl = self._AGECHECK_TEMPLATE % gameID
2298 self.report_age_confirmation()
2299 webpage = self._download_webpage(videourl, gameID)
2301 self.report_extraction(gameID)
2302 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
2303 webpage, 'game title')
2305 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
2306 mweb = re.finditer(urlRE, webpage)
2307 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
2308 titles = re.finditer(namesRE, webpage)
2309 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
2310 thumbs = re.finditer(thumbsRE, webpage)
# The three iterators are assumed to align positionally — zip truncates
# to the shortest; a mismatch would silently pair wrong metadata.
2312 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
2313 video_id = vid.group('videoID')
2314 title = vtitle.group('videoName')
2315 video_url = vid.group('videoURL')
2316 video_thumb = thumb.group('thumbnail')
# (guard elided) reached when no usable video_url was captured
2318 raise ExtractorError(u'Cannot find video url for %s' % video_id)
2323 'title': unescapeHTML(title),
2324 'thumbnail': video_thumb
2327 return [self.playlist_result(videos, gameID, game_title)]
# Information extractor for ustream.tv recorded videos: the media URL is
# constructed from the id; title/uploader/thumbnail are scraped.
# NOTE(review): numbered excerpt — second argument of the title search
# (original line ~2343) and the info dict braces are elided.
2329 class UstreamIE(InfoExtractor):
2330 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
2331 IE_NAME = u'ustream'
2333 def _real_extract(self, url):
2334 m = re.match(self._VALID_URL, url)
2335 video_id = m.group('videoID')
# Video URL is constructed, not scraped, from the recorded-video id.
2337 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
2338 webpage = self._download_webpage(url, video_id)
2340 self.report_extraction(video_id)
2342 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
2345 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
2346 webpage, u'uploader', fatal=False, flags=re.DOTALL)
2348 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
2349 webpage, u'thumbnail', fatal=False)
2355 'title': video_title,
2356 'uploader': uploader,
2357 'thumbnail': thumbnail,
# Information extractor for worldstarhiphop.com (and worldstarcandy.com):
# pulls the file URL out of a flashvars call; candy videos get their title
# from a different span.
# NOTE(review): numbered excerpt — the body of the "if 'mp4' in video_url:"
# branch and the returned dict braces are elided; kept byte-identical.
2361 class WorldStarHipHopIE(InfoExtractor):
2362 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
2363 IE_NAME = u'WorldStarHipHop'
2365 def _real_extract(self, url):
2366 m = re.match(self._VALID_URL, url)
2367 video_id = m.group('id')
2369 webpage_src = self._download_webpage(url, video_id)
2371 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
2372 webpage_src, u'video URL')
# (branch body elided) presumably sets the extension per URL type — TODO confirm
2374 if 'mp4' in video_url:
2379 video_title = self._html_search_regex(r"<title>(.*)</title>",
2380 webpage_src, u'title')
2382 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
2383 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
2384 webpage_src, u'thumbnail', fatal=False)
# Candy pages keep the real title in a 'candytitles' span; override if found.
2387 _title = r"""candytitles.*>(.*)</span>"""
2388 mobj = re.search(_title, webpage_src)
2389 if mobj is not None:
2390 video_title = mobj.group(1)
2395 'title' : video_title,
2396 'thumbnail' : thumbnail,
# Information extractor for rbmaradio.com shows: show metadata lives in a
# JSON blob assigned to window.gon in the page's javascript.
# NOTE(review): numbered excerpt — lines between the regex search and the
# json.loads (original 2412-2413) and the dict braces are elided.
2401 class RBMARadioIE(InfoExtractor):
2402 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
2404 def _real_extract(self, url):
2405 m = re.match(self._VALID_URL, url)
2406 video_id = m.group('videoID')
2408 webpage = self._download_webpage(url, video_id)
2410 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
2411 webpage, u'json data', flags=re.MULTILINE)
2414 data = json.loads(json_data)
2415 except ValueError as e:
2416 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps stream; extension is taken from the URL path.
2418 video_url = data['akamai_url'] + '&cbr=256'
2419 url_parts = compat_urllib_parse_urlparse(video_url)
2420 video_ext = url_parts.path.rpartition('.')[2]
2425 'title': data['title'],
2426 'description': data.get('teaser_text'),
2427 'location': data.get('country_of_origin'),
2428 'uploader': data.get('host', {}).get('name'),
2429 'uploader_id': data.get('host', {}).get('slug'),
2430 'thumbnail': data.get('image', {}).get('large_url_2x'),
2431 'duration': data.get('duration'),
# Information extractor for youporn.com: reads metadata from an embedded
# JSON Video(...) constructor, scrapes the download list for all formats,
# and honors the downloader's --format / --list-formats options.
# NOTE(review): numbered excerpt — several try/except headers, loop headers
# and returns are elided (e.g. around original lines 2466-2470, 2493-2496,
# 2509-2521, 2528-2541); kept byte-identical.
2436 class YouPornIE(InfoExtractor):
2437 """Information extractor for youporn.com."""
2438 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
2440 def _print_formats(self, formats):
2441 """Print all available formats"""
2442 print(u'Available formats:')
2443 print(u'ext\t\tformat')
2444 print(u'---------------------------------')
2445 for format in formats:
2446 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' equals req_format (loop header
# and return are elided in this excerpt).
2448 def _specific(self, req_format, formats):
2450 if(x["format"]==req_format):
2454 def _real_extract(self, url):
2455 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
2457 raise ExtractorError(u'Invalid URL: %s' % url)
2458 video_id = mobj.group('videoid')
# The site requires an age_verified cookie to serve the real page.
2460 req = compat_urllib_request.Request(url)
2461 req.add_header('Cookie', 'age_verified=1')
2462 webpage = self._download_webpage(req, video_id)
2464 # Get JSON parameters
2465 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
2467 params = json.loads(json_params)
2469 raise ExtractorError(u'Invalid JSON')
2471 self.report_extraction(video_id)
# (inside an elided try:) missing keys fall through to the KeyError
# handler that raises the 'Missing JSON parameter' error below.
2473 video_title = params['title']
2474 upload_date = unified_strdate(params['release_date_f'])
2475 video_description = params['description']
2476 video_uploader = params['submitted_by']
2477 thumbnail = params['thumbnails'][0]['image']
2479 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
2481 # Get all of the formats available
2482 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
2483 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
2484 webpage, u'download list').strip()
2486 # Get all of the links from the page
2487 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
2488 links = re.findall(LINK_RE, download_list_html)
2489 if(len(links) == 0):
2490 raise ExtractorError(u'ERROR: no known formats available for video')
2492 self.to_screen(u'Links found: %d' % len(links))
2497 # A link looks like this:
2498 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
2499 # A path looks like this:
2500 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
2501 video_url = unescapeHTML( link )
2502 path = compat_urllib_parse_urlparse( video_url ).path
2503 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<resolution>_<bitrate>_<id>"; keep first two.
2504 format = path.split('/')[4].split('_')[:2]
2507 format = "-".join( format )
2508 # title = u'%s-%s-%s' % (video_title, size, bitrate)
2513 'uploader': video_uploader,
2514 'upload_date': upload_date,
2515 'title': video_title,
2518 'thumbnail': thumbnail,
2519 'description': video_description
2522 if self._downloader.params.get('listformats', None):
2523 self._print_formats(formats)
2526 req_format = self._downloader.params.get('format', None)
2527 self.to_screen(u'Format: %s' % req_format)
# Formats appear sorted best-first: 'best' takes the head (elided),
# 'worst' the tail, 'all'/-1 returns everything.
2529 if req_format is None or req_format == 'best':
2531 elif req_format == 'worst':
2532 return [formats[-1]]
2533 elif req_format in ('-1', 'all'):
2536 format = self._specific( req_format, formats )
2538 raise ExtractorError(u'Requested format not available')
# Information extractor for pornotube.com: URL carries the title; the flv
# URL and upload date are scraped from the page.
# NOTE(review): numbered excerpt — guard clause, extension handling and the
# remainder of the info dict are elided; kept byte-identical.
2543 class PornotubeIE(InfoExtractor):
2544 """Information extractor for pornotube.com."""
2545 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
2547 def _real_extract(self, url):
2548 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
2550 raise ExtractorError(u'Invalid URL: %s' % url)
2552 video_id = mobj.group('videoid')
# Title is taken straight from the URL slug, not from the page.
2553 video_title = mobj.group('title')
2555 # Get webpage content
2556 webpage = self._download_webpage(url, video_id)
2559 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
2560 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
2561 video_url = compat_urllib_parse.unquote(video_url)
2563 #Get the uploaded date
2564 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
2565 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# Date is best-effort: only normalized when the regex matched.
2566 if upload_date: upload_date = unified_strdate(upload_date)
2568 info = {'id': video_id,
2571 'upload_date': upload_date,
2572 'title': video_title,
# Information extractor for youjizz.com: title comes from the main page,
# the media URL from a separate embed page.
# NOTE(review): numbered excerpt — the "if result is None:" guard and
# several info-dict entries/return are elided; kept byte-identical.
2578 class YouJizzIE(InfoExtractor):
2579 """Information extractor for youjizz.com."""
2580 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
2582 def _real_extract(self, url):
2583 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
2585 raise ExtractorError(u'Invalid URL: %s' % url)
2587 video_id = mobj.group('videoid')
2589 # Get webpage content
2590 webpage = self._download_webpage(url, video_id)
2592 # Get the video title
2593 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
2594 webpage, u'title').strip()
2596 # Get the embed page
2597 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
2599 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is rebound to the embed page's numeric id from here on.
2601 embed_page_url = result.group(0).strip()
2602 video_id = result.group('videoid')
2604 webpage = self._download_webpage(embed_page_url, video_id)
2607 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
2608 webpage, u'video URL')
2610 info = {'id': video_id,
2612 'title': video_title,
2615 'player_url': embed_page_url}
# Information extractor for 8tracks.com mixes: reads the PAGE.mix JSON from
# the page, then walks the play/next API one track at a time using a random
# session id, until at_last_track is set.
# NOTE(review): numbered excerpt — the mix_id assignment used by the URL
# templates, the res.append(...) around the per-track dict, and the final
# return are elided; kept byte-identical.
2619 class EightTracksIE(InfoExtractor):
2621 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
2623 def _real_extract(self, url):
2624 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
2626 raise ExtractorError(u'Invalid URL: %s' % url)
2627 playlist_id = mobj.group('id')
2629 webpage = self._download_webpage(url, playlist_id)
2631 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
2632 data = json.loads(json_like)
# The play API wants a client-chosen numeric session token.
2634 session = str(random.randint(0, 1000000000))
2636 track_count = data['tracks_count']
2637 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
2638 next_url = first_url
# One API round-trip per track; the loop ends via the break that
# follows the at_last_track check below.
2640 for i in itertools.count():
2641 api_json = self._download_webpage(next_url, playlist_id,
2642 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
2643 errnote=u'Failed to download song information')
2644 api_data = json.loads(api_json)
2645 track_data = api_data[u'set']['track']
2647 'id': track_data['id'],
2648 'url': track_data['track_file_stream_url'],
2649 'title': track_data['performer'] + u' - ' + track_data['name'],
2650 'raw_title': track_data['name'],
2651 'uploader_id': data['user']['login'],
2655 if api_data['set']['at_last_track']:
2657 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Information extractor for keek.com: media and thumbnail URLs are built
# from the video id against the CDN; title/uploader are scraped.
# NOTE(review): numbered excerpt — second argument of the title search and
# the info dict braces/return are elided; kept byte-identical.
2660 class KeekIE(InfoExtractor):
2661 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
2664 def _real_extract(self, url):
2665 m = re.match(self._VALID_URL, url)
2666 video_id = m.group('videoID')
# Both URLs are constructed from the id, not scraped from the page.
2668 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
2669 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
2670 webpage = self._download_webpage(url, video_id)
2672 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2675 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
2676 webpage, u'uploader', fatal=False)
2682 'title': video_title,
2683 'thumbnail': thumbnail,
2684 'uploader': uploader
# Information extractor for ted.com: handles both single talks and
# playlists; playlist pages are resolved into per-talk url_results, talks
# are parsed from an embedded talkDetails JSON blob.
# NOTE(review): numbered excerpt — parts of _VALID_URL and video_RE, the
# second argument of the title search, and the info-dict braces are elided;
# kept byte-identical.
2688 class TEDIE(InfoExtractor):
2689 _VALID_URL=r'''http://www\.ted\.com/
2691 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
2693 ((?P<type_talk>talks)) # We have a simple talk
2695 (/lang/(.*?))? # The url may contain the language
2696 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL needs re.VERBOSE.
2700 def suitable(cls, url):
2701 """Receives a URL and returns True if suitable for this IE."""
2702 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2704 def _real_extract(self, url):
2705 m=re.match(self._VALID_URL, url, re.VERBOSE)
2706 if m.group('type_talk'):
2707 return [self._talk_info(url)]
# (else branch) playlist URL
2709 playlist_id=m.group('playlist_id')
2710 name=m.group('name')
2711 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
2712 return [self._playlist_videos_info(url,name,playlist_id)]
2714 def _playlist_videos_info(self,url,name,playlist_id=0):
2715 '''Returns the videos of the playlist'''
# (start of video_RE, a verbose regex, is elided in this excerpt)
2717 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
2718 ([.\s]*?)data-playlist_item_id="(\d+)"
2719 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
2721 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
2722 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
2723 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
2724 m_names=re.finditer(video_name_RE,webpage)
2726 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
2727 webpage, 'playlist title')
# Each playlist entry is delegated back to this IE via url_result.
2729 playlist_entries = []
2730 for m_video, m_name in zip(m_videos,m_names):
2731 video_id=m_video.group('video_id')
2732 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
2733 playlist_entries.append(self.url_result(talk_url, 'TED'))
2734 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
2736 def _talk_info(self, url, video_id=0):
2737 """Return the video for the talk in the url"""
2738 m = re.match(self._VALID_URL, url,re.VERBOSE)
2739 video_name = m.group('name')
2740 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
2741 self.report_extraction(video_name)
2742 # If the url includes the language we get the title translated
2743 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
2745 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
2746 webpage, 'json data')
2747 info = json.loads(json_data)
2748 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
2749 webpage, 'description', flags = re.DOTALL)
2751 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
2752 webpage, 'thumbnail')
# Last htmlStreams entry is used — presumably the highest quality; TODO confirm
2755 'url': info['htmlStreams'][-1]['file'],
2758 'thumbnail': thumbnail,
2759 'description': desc,
# Information extractor for myspass.de: video id is taken from the URL
# path, then all metadata comes from the site's XML metadata endpoint.
# NOTE(review): numbered excerpt — the condition guarding the second
# os.path.split (empty last segment from a trailing slash), the else
# branches for missing optional fields, and the info-dict open/close are
# elided; kept byte-identical.
2763 class MySpassIE(InfoExtractor):
2764 _VALID_URL = r'http://www.myspass.de/.*'
2766 def _real_extract(self, url):
2767 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2769 # video id is the last path element of the URL
2770 # usually there is a trailing slash, so also try the second but last
2771 url_path = compat_urllib_parse_urlparse(url).path
2772 url_parent_path, video_id = os.path.split(url_path)
2774 _, video_id = os.path.split(url_parent_path)
2777 metadata_url = META_DATA_URL_TEMPLATE % video_id
2778 metadata_text = self._download_webpage(metadata_url, video_id)
2779 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2781 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are
# optional (their absence branches are partially elided here).
2782 url_flv_el = metadata.find('url_flv')
2783 if url_flv_el is None:
2784 raise ExtractorError(u'Unable to extract download url')
2785 video_url = url_flv_el.text
2786 extension = os.path.splitext(video_url)[1][1:]
2787 title_el = metadata.find('title')
2788 if title_el is None:
2789 raise ExtractorError(u'Unable to extract title')
2790 title = title_el.text
2791 format_id_el = metadata.find('format_id')
2792 if format_id_el is None:
2795 format = format_id_el.text
2796 description_el = metadata.find('description')
2797 if description_el is not None:
2798 description = description_el.text
2801 imagePreview_el = metadata.find('imagePreview')
2802 if imagePreview_el is not None:
2803 thumbnail = imagePreview_el.text
2812 'thumbnail': thumbnail,
2813 'description': description
# Information extractor for spiegel.de videos: title from the page, media
# variants from a per-video XML document; the last <type> entry is used.
# NOTE(review): numbered excerpt — second argument of the title search and
# the info-dict open/close + return are elided; kept byte-identical.
2817 class SpiegelIE(InfoExtractor):
2818 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2820 def _real_extract(self, url):
2821 m = re.match(self._VALID_URL, url)
2822 video_id = m.group('videoID')
2824 webpage = self._download_webpage(url, video_id)
2826 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
2829 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2830 xml_code = self._download_webpage(xml_url, video_id,
2831 note=u'Downloading XML', errnote=u'Failed to download XML')
2833 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element is used — presumably the best variant; TODO confirm
2834 last_type = idoc[-1]
2835 filename = last_type.findall('./filename')[0].text
2836 duration = float(last_type.findall('./duration')[0].text)
2838 video_url = 'http://video2.spiegel.de/flash/' + filename
2839 video_ext = filename.rpartition('.')[2]
2844 'title': video_title,
2845 'duration': duration,
# Information extractor for liveleak.com: media URL from the player config,
# title/description/uploader from meta tags and page markup.
# NOTE(review): numbered excerpt — guard clause and info-dict braces are
# elided; kept byte-identical.
2849 class LiveLeakIE(InfoExtractor):
2851 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2852 IE_NAME = u'liveleak'
2854 def _real_extract(self, url):
2855 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
2857 raise ExtractorError(u'Invalid URL: %s' % url)
2859 video_id = mobj.group('video_id')
2861 webpage = self._download_webpage(url, video_id)
2863 video_url = self._search_regex(r'file: "(.*?)",',
2864 webpage, u'video URL')
# The site prefixes titles with "LiveLeak.com -"; strip it.
2866 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2867 webpage, u'title').replace('LiveLeak.com -', '').strip()
2869 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2870 webpage, u'description', fatal=False)
2872 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2873 webpage, u'uploader', fatal=False)
2879 'title': video_title,
2880 'description': video_description,
2881 'uploader': video_uploader
# Information extractor for ARD Mediathek / daserste.de: collects all
# mediaCollection.addMediaStream(...) calls, picks media_type 0 at the
# highest quality, and distinguishes RTMP streams from plain HTTP mp4s.
# NOTE(review): numbered excerpt — conditions around the documentId
# fallback, the empty-streams check before the fsk assert, the else for
# the HTTP case, and the final return are elided; kept byte-identical.
2886 class ARDIE(InfoExtractor):
2887 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
2888 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
2889 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
2891 def _real_extract(self, url):
2892 # determine video id from url
2893 m = re.match(self._VALID_URL, url)
# Prefer an explicit documentId=... query parameter over the path slug.
2895 numid = re.search(r'documentId=([0-9]+)', url)
2897 video_id = numid.group(1)
2899 video_id = m.group('video_id')
2901 # determine title and media streams from webpage
2902 html = self._download_webpage(url, video_id)
2903 title = re.search(self._TITLE, html).group('title')
2904 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# (guard elided) no streams + an "fsk" marker means an age-restricted
# video that is only served in the evening.
2906 assert '"fsk"' in html
2907 raise ExtractorError(u'This video is only available after 8:00 pm')
2909 # choose default media type and highest quality for now
2910 stream = max([s for s in streams if int(s["media_type"]) == 0],
2911 key=lambda s: int(s["quality"]))
2913 # there's two possibilities: RTMP stream or HTTP download
2914 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
2915 if stream['rtmp_url']:
2916 self.to_screen(u'RTMP download detected')
2917 assert stream['video_url'].startswith('mp4:')
2918 info["url"] = stream["rtmp_url"]
2919 info["play_path"] = stream['video_url']
# (else branch) plain HTTP download
2921 assert stream["video_url"].endswith('.mp4')
2922 info["url"] = stream["video_url"]
# Information extractor for the ZDF Mediathek: scrapes stream variants from
# the page, fetches the chosen variant's ASX-style redirect document, and
# extracts the final mms:// or rtsp:// URL from it.
# NOTE(review): numbered excerpt — the break statements after each stream
# match, the empty-streams guard, several "if mobj is None:" guards and
# the tail of the returned dict are elided; kept byte-identical.
2925 class ZDFIE(InfoExtractor):
2926 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
2927 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
2928 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
2929 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
2930 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
2932 def _real_extract(self, url):
2933 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
2935 raise ExtractorError(u'Invalid URL: %s' % url)
2936 video_id = mobj.group('video_id')
2938 html = self._download_webpage(url, video_id)
2939 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
2941 raise ExtractorError(u'No media url found.')
2943 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
2944 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
2945 # choose first/default media type and highest quality for now
2946 for s in streams: #find 300 - dsl1000mbit
2947 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
2950 for s in streams: #find veryhigh - dsl2000mbit
2951 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
2955 raise ExtractorError(u'No stream found.')
# The stream URL points at a small redirect document, not the media.
2957 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
2959 self.report_extraction(video_id)
2960 mobj = re.search(self._TITLE, html)
2962 raise ExtractorError(u'Cannot extract title')
2963 title = unescapeHTML(mobj.group('title'))
# Try mms:// first, fall back to rtsp:// inside the redirect document.
2965 mobj = re.search(self._MMS_STREAM, media_link)
2967 mobj = re.search(self._RTSP_STREAM, media_link)
2969 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
2970 mms_url = mobj.group('video_url')
2972 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
2974 raise ExtractorError(u'Cannot extract extention')
2975 ext = mobj.group('ext')
2977 return [{'id': video_id,
# Information extractor for tumblr video posts: the media URL sits in
# hex-escaped (\x22) javascript markup inside the post page.
# NOTE(review): numbered excerpt — the "if video is None:" guard and the
# tail of the returned dict are elided; kept byte-identical.
2983 class TumblrIE(InfoExtractor):
2984 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2986 def _real_extract(self, url):
2987 m_url = re.match(self._VALID_URL, url)
2988 video_id = m_url.group('id')
2989 blog = m_url.group('blog_name')
# Normalize to the canonical /post/ URL regardless of the input form.
2991 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2992 webpage = self._download_webpage(url, video_id)
# \x22 is an escaped double quote inside the embedded javascript.
2994 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2995 video = re.search(re_video, webpage)
2997 raise ExtractorError(u'Unable to extract video')
2998 video_url = video.group('video_url')
2999 ext = video.group('ext')
3001 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
3002 webpage, u'thumbnail', fatal=False) # We pick the first poster
3003 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
3005 # The only place where you can get a title, it's not complete,
3006 # but searching in other places doesn't work for all videos
3007 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
3008 webpage, u'title', flags=re.DOTALL)
3010 return [{'id': video_id,
3012 'title': video_title,
3013 'thumbnail': video_thumbnail,
# Information extractor for bandcamp.com free tracks: finds the free
# download page, parses the TralbumData/items javascript blobs, and
# re-requests the mp3-320 URL through the /statdownload endpoint because
# the originally-served link is already expired.
# NOTE(review): numbered excerpt — the tail of the track_info dict and the
# return are elided; kept byte-identical. Shadowing the builtin 'id' is
# pre-existing in the original.
3017 class BandcampIE(InfoExtractor):
3018 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
3020 def _real_extract(self, url):
3021 mobj = re.match(self._VALID_URL, url)
3022 title = mobj.group('title')
3023 webpage = self._download_webpage(url, title)
3024 # We get the link to the free download page
3025 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
# Only tracks offered as free downloads are supported.
3026 if m_download is None:
3027 raise ExtractorError(u'No free songs found')
3029 download_link = m_download.group(1)
3030 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
3031 webpage, re.MULTILINE|re.DOTALL).group('id')
3033 download_webpage = self._download_webpage(download_link, id,
3034 'Downloading free downloads page')
3035 # We get the dictionary of the track from some javascrip code
3036 info = re.search(r'items: (.*?),$',
3037 download_webpage, re.MULTILINE).group(1)
3038 info = json.loads(info)[0]
3039 # We pick mp3-320 for now, until format selection can be easily implemented.
3040 mp3_info = info[u'downloads'][u'mp3-320']
3041 # If we try to use this url it says the link has expired
3042 initial_url = mp3_info[u'url']
3043 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
3044 m_url = re.match(re_url, initial_url)
3045 #We build the url we will use to get the final track url
3046 # This url is build in Bandcamp in the script download_bunde_*.js
# The .rand value is hard-coded; see the retry_url fallback note below.
3047 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
3048 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
3049 # If we could correctly generate the .rand field the url would be
3050 #in the "download_url" key
3051 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
3053 track_info = {'id':id,
3054 'title' : info[u'title'],
3057 'thumbnail' : info[u'thumb_url'],
3058 'uploader' : info[u'artist']
# Information extractor for redtube.com: mp4 URL from a <source> tag,
# title from the page heading.
# NOTE(review): numbered excerpt — guard clause, the title search's second
# argument, and the returned dict's open/close are elided.
3063 class RedTubeIE(InfoExtractor):
3064 """Information Extractor for redtube"""
3065 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
3067 def _real_extract(self,url):
3068 mobj = re.match(self._VALID_URL, url)
# (guard elided) reached only when the URL does not match _VALID_URL
3070 raise ExtractorError(u'Invalid URL: %s' % url)
3072 video_id = mobj.group('id')
3073 video_extension = 'mp4'
3074 webpage = self._download_webpage(url, video_id)
3076 self.report_extraction(video_id)
3078 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
3079 webpage, u'video URL')
3081 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
3087 'ext': video_extension,
3088 'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr (Institut national de l'audiovisuel)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The metadata (including the direct mp4 url) lives in an MRSS feed,
        # not in the HTML page the user gave us.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com how-to videos."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Canonicalize the URL: the regex accepts https and bare-domain forms.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The mobile mp4 url is embedded in the page's player javascript.
        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co short videos."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The direct stream url is exposed through a twitter player meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Strip any query string from the thumbnail url via the second group.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        # The uploader name sits in a <h2> inside the user div; DOTALL because
        # the markup spans several lines.
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Flickr requires a per-photo "secret" token to hand out the playlist.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # APP is the server prefix, FULLPATH the (html-escaped) stream path.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The slug in the URL is not the numeric id; dig it out of the markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The real media url lives in a separate per-video XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # Pick the "high" quality file entry from the XML.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster."""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config holds an optional server and the (possibly
        # percent-encoded) file path.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: 'file' is already a complete, encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Collapse YYYY-MM-DD into the YYYYMMDD format used throughout.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem (Hype Machine) tracks."""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' query parameters are required by the site; the Set-Cookie
        # returned with this request must accompany the later metadata call.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # Empty POST body; the session cookie from the first request is needed
        # or the serve endpoint refuses the request.
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    "mp3",
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7."""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via javascript; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The media url comes from a form-encoded POST to the player backend.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is "key1=url&key2=thumb"; keep only the values.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       'flv',
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and episodes."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than
        # single videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        # finditer over the mediagen feed; `m_urls` is always a list, so a
        # simple truthiness check replaces the old `is None or len()==0` test.
        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            # BUG FIX: this previously raised the undefined name
            # `ExtractError` ("Unable to extrat video url"), which would have
            # surfaced as a NameError instead of a clean extraction error.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
3463 def gen_extractors():
3464     """ Return a list of an instance of every supported extractor.
3465     The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): the body of this function (the full ordered list of extractor
# instances) is elided in this chunk; only three entries are visible below.
# The ordering is load-bearing — generic/catch-all extractors must come last —
# so do not sort or dedupe this list mechanically.
3468         YoutubePlaylistIE(),
3493         StanfordOpenClassroomIE(),
3503         WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    Looks up the module-level class named ``<ie_name>IE`` (e.g. ``'Youtube'``
    resolves to ``YoutubeIE``). Raises KeyError if no such extractor exists,
    matching the original behavior.
    """
    return globals()[ie_name + 'IE']