10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.myvideo import MyVideoIE
37 from .extractor.statigram import StatigramIE
38 from .extractor.photobucket import PhotobucketIE
39 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
40 from .extractor.vimeo import VimeoIE
41 from .extractor.xvideos import XVideosIE
42 from .extractor.yahoo import YahooIE, YahooSearchIE
43 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
44 from .extractor.zdf import ZDFIE
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Return value as text, decoding it as UTF-8 only if it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping of bitrate -> url list or a plain
        url list; in the latter case the list itself is returned.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened"""
        for url in url_list:
            try:
                response = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            # The connection is only a liveness probe; release it right away.
            response.close()
            return url
        return None

    def _print_formats(self, formats):
        """Print one line per format/bitrate with the extension of its first url."""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the cloudcast JSON API and return the info of the first reachable audio url.

        Raises ExtractorError on an invalid URL, a failed API download, an
        unavailable requested format or when no candidate url can be opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        track = self._to_text(mobj.group(2))
        file_id = uploader + u'-' + track

        # construct API request from the matched groups so that the result
        # does not depend on the URL ending with a slash
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + uploader + '/' + track + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(self._to_text(jsonData))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for candidate in formats:
                file_url = self.check_urls(self.get_urls(formats, candidate))
                if file_url is not None:
                    format_param = candidate
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')
            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find an active download URL')

        file_url = self._to_text(file_url)
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'
    _MAIN_FOLDER = 'http://openclassroom.stanford.edu/MainFolder/'

    def _extract_linked_pages(self, page, link_re):
        """Run extract() on every distinct link of page matching link_re and concatenate the results."""
        results = []
        for link in orderedSet(re.findall(link_re, page)):
            results += self.extract(self._MAIN_FOLDER + unescapeHTML(link))
        return results

    def _real_extract(self, url):
        """Extract a single video, all videos of a course, or all courses of the site.

        The matched 'course' and 'video' groups select which of the three
        cases applies. Raises ExtractorError on an invalid URL, a failed
        download or malformed video metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        course = mobj.group('course')
        video = mobj.group('video')

        if course and video: # A specific video
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = self._MAIN_FOLDER + 'courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                response = compat_urllib_request.urlopen(xmlUrl)
                try:
                    metaXml = response.read()
                finally:
                    response.close()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except (IndexError, TypeError):
                # IndexError: element missing; TypeError: empty <videoFile/>
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]

        if course: # A course page
            coursepage = self._download_webpage(url, course,
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')
            return self._extract_linked_pages(
                coursepage, r'<a href="(VideoPage.php\?[^"]+)">')

        # Root page. _download_webpage returns text, so the str pattern
        # below can be applied to it on Python 3 as well.
        rootpage = self._download_webpage(
            self._MAIN_FOLDER + 'HomePage.php', 'Stanford OpenClassroom',
            errnote=u'Unable to download course info page')
        return self._extract_linked_pages(
            rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Read the page metadata, query the mediaGen service and return the last rendition.

        Raises ExtractorError on an invalid URL, missing page metadata, a
        failed metadata download or an unusable rendition list.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # NOTE(review): mtv_an is taken to be the artist name and mtv_vt the
        # track title - confirm against a live page.
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        if song_name:
            video_title = performer + u' - ' + song_name
        else:
            video_title = performer

        # Both values are required to build the mediaGen URL, so a miss is fatal.
        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri')

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'No renditions found')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: attribute missing; AttributeError: no <src> child
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return a session id made of the current time in ms followed by two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the id alphabet as a list, permuted by the generator started from seed."""
        mixed = []
        # The backslash is part of the alphabet; it is escaped explicitly.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated indices in fileId through the seed's mixed alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Return one info dict per segment of the video.

        Raises ExtractorError on an invalid URL or when the playlist JSON
        lacks the expected fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            req_format = self._downloader.params.get('format', None)

            if req_format is None or req_format == 'best':
                stream = 'hd2' if 'hd2' in entry['streamfileids'] else 'flv'
                ext = u'flv'
            elif req_format == 'worst':
                stream = 'mp4'
                ext = u'mp4'
            else:
                stream = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][stream]
            keys = [s['k'] for s in entry['segs'][stream]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError, TypeError):
            # IndexError/TypeError cover an empty or wrongly shaped 'data' entry
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        files_info = []
        # Characters 8-9 (0-based) of fileid hold the segment number and are
        # replaced by the two-digit hex index of each segment.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Scrape the video page for the media URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        page = self._download_webpage(url, video_id)

        # The media URL is percent-encoded inside the page.
        encoded_url = self._search_regex(self.VIDEO_URL_RE, page, u'video URL')
        media_url = compat_urllib_parse.unquote(encoded_url)

        title = self._html_search_regex(self.VIDEO_TITLE_RE, page, u'title')

        # The thumbnail is optional: a miss does not abort the extraction.
        thumb = self._search_regex(self.VIDEO_THUMB_RE, page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',  # NOTE(review): container assumed from the flv_url field - confirm
            'thumbnail': thumb,
            'description': None,
        }
        return [result]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos"""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN URL from the page path and read title/description from the page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The captured path (with its leading slash) identifies the video.
        video_path = match.group(1)
        short_id = video_path.rpartition('/')[2]

        page = self._download_webpage(url, video_path)

        media_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4'

        # Fall back to the last path component when og:title is absent.
        raw_title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id)
        title = raw_title.replace('NBA.com: ', '')

        # No upload date is extracted: it isn't there in the HTML returned to us.
        summary = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': summary,
        }]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # NOTE(review): the grouping/terminator lines of this verbose pattern
    # were restored around the three named alternatives - confirm.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url, video_id):
        """Download one API page and return (number of items, list of *valid* items).

        An item is valid when it carries a non-empty 'video_file_url'.
        Raises ExtractorError when the API answers with something other
        than a JSON list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            # A clip without the key is skipped just like one with an empty URL.
            video_url = clip.get('video_file_url')
            if not video_url:
                continue
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            info.append({
                'id': clip_id,
                'url': video_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': re.sub('-', '', clip['start_time'][:10]),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _extract_chapter(self, url, chapter_id, api_base):
        """Return the info list for a chapter URL (the whole archive file is used)."""
        webpage = self._download_webpage(url, chapter_id)
        m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not m:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = m.group(1)

        api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(api, chapter_id,
                                         note=u'Downloading chapter information',
                                         errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
        for archive in doc.findall('.//archive'):
            # findtext tolerates an <archive> element without an <id> child
            if archive_id == archive.findtext('./id'):
                break
        else:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                               note='Downloading chapter metadata',
                               errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        return [{
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
        }]

    def _real_extract(self, url):
        """Extract a channel archive (paged), a single broadcast or a chapter."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            return self._extract_chapter(url, mobj.group('chapterid'), api_base)
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the video page for the second <source> URL, the title and the description."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        source_re = r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"'
        media_url = self._html_search_regex(
            source_re, page, u'video URL', flags=re.DOTALL)

        # The page heading is tried first, the <title> tag second.
        title_res = (
            r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>',
        )
        title = self._html_search_regex(
            title_res, page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',  # NOTE(review): container assumed - confirm
            'title': title,
            'description': description,
        }]
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a store.steampowered.com game page"""

    # Verbose pattern: always match it with re.VERBOSE.
    # NOTE(review): the agecheck and gameID lines were restored from the
    # group names the code reads - confirm.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of all videos of the game."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        page = self._download_webpage(self._VIDEO_PAGE_TEMPLATE % game_id, game_id)

        # Age-gated games answer with a birth date form; re-request the page
        # through the age check URL, which carries a fixed birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', page) is not None:
            agecheck_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            page = self._download_webpage(agecheck_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(
            r'<h2 class="pageheader">(.*?)</h2>', page, 'game title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        name_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # The three match streams are paired positionally, in page order.
        matches = zip(
            re.finditer(movie_re, page),
            re.finditer(name_re, page),
            re.finditer(thumb_re, page))

        entries = []
        for movie, name, thumb in matches:
            movie_id = movie.group('videoID')
            movie_url = movie.group('videoURL')
            if not movie_url:
                raise ExtractorError(u'Cannot find video url for %s' % movie_id)
            entries.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',  # NOTE(review): container assumed - confirm
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(entries, game_id, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos"""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN URL from the video id and scrape title, uploader and thumbnail."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        media_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', page, u'title')

        # Uploader and thumbnail are optional metadata.
        channel = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        thumb = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': media_url,
            'ext': 'flv',  # NOTE(review): container assumed - confirm
            'title': title,
            'uploader': channel,
            'thumbnail': thumb,
        }
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com"""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape the player variables and page metadata of a video page."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        media_url = self._search_regex(
            r'so\.addVariable\("file","(.*?)"\)', page, u'video URL')

        # The extension is guessed from the URL: mp4 if mentioned, flv otherwise.
        extension = 'mp4' if 'mp4' in media_url else 'flv'

        title = self._html_search_regex(r"<title>(.*)</title>", page, u'title')

        thumb = self._html_search_regex(
            r'rel="image_src" href="(.*)" />', page, u'thumbnail', fatal=False)

        # Pages without a thumbnail are WSHH candy videos, whose real title
        # lives in the "candytitles" element.
        if not thumb:
            candy = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy is not None:
                title = candy.group(1)

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'thumbnail': thumb,
            'ext': extension,
        }]
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows"""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Parse the `gon.show` JSON object embedded in the show page."""
        show_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, show_id)

        raw_json = self._search_regex(
            r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream.
        media_url = show['akamai_url'] + '&cbr=256'
        media_path = compat_urllib_parse_urlparse(media_url).path

        # Optional sub-objects default to an empty dict so that .get() is safe.
        host = show.get('host', {})
        image = show.get('image', {})

        return [{
            'id': show_id,
            'url': media_url,
            'ext': media_path.rpartition('.')[2],
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': show.get('duration'),
        }]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for entry in formats:
            print(u'%s\t\t%s' % (entry['ext'], entry['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Return the info dict(s) of the requested format(s).

        Honours the downloader's 'listformats' and 'format' parameters
        ('best', 'worst', '-1'/'all' or an explicit format name). Raises
        ExtractorError on an invalid URL, unusable page data or an
        unavailable format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The cookie skips the age verification page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as err:
            # IndexError covers an empty 'thumbnails' list
            raise ExtractorError(u'Missing JSON parameter: ' + compat_str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component starts with "<size>_<bitrate>_".
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # The download list is taken to be ordered best first.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        if req_format == 'worst':
            return [formats[-1]]
        if req_format in ('-1', 'all'):
            return formats

        selected = self._specific(req_format, formats)
        if selected is None:
            raise ExtractorError(u'Requested format not available')
        return [selected]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Take id and title from the URL; scrape media URL and upload date from the page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('videoid')
        title = match.group('title')

        page = self._download_webpage(url, video_id)

        # The media URL is percent-encoded inside the page.
        media_url_re = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        media_url = compat_urllib_parse.unquote(
            self._search_regex(media_url_re, page, u'video url'))

        # The upload date is optional; normalise it only when it was found.
        added_re = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        added = self._html_search_regex(added_re, page, u'upload date', fatal=False)
        if added:
            added = unified_strdate(added)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': added,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
        }]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page linked from the video page and read the media URL from it."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        page_id = match.group('videoid')
        page = self._download_webpage(url, page_id)

        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', page, u'title').strip()

        # The numeric id used from here on comes from the embed page URL.
        embed = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_url = embed.group(0).strip()
        video_id = embed.group('videoid')

        embed_page = self._download_webpage(embed_url, video_id)

        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',  # NOTE(review): container assumed - confirm
            'format': 'flv',
            'player_url': embed_url,
        }]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes"""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the play/next API of a mix and return one info dict per track."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group('id')

        page = self._download_webpage(url, playlist_id)

        mix_json = self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL)
        mix = json.loads(mix_json)

        # A random play session id is sent with every API request.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        total = mix['tracks_count']

        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        for position in itertools.count():
            reply = json.loads(self._download_webpage(
                api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position+1), total),
                errnote=u'Failed to download song information'))
            play_set = reply[u'set']
            track = play_set['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',  # NOTE(review): container assumed - confirm
            })
            if play_set['at_last_track']:
                break
            # Each following request names the track that was just delivered.
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
class KeekIE(InfoExtractor):
    """Information extractor for keek.com"""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN URLs from the video id and scrape title and uploader from the page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail locations are derived from the id alone.
        media_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumb_url = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')

        user = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',  # NOTE(review): container assumed - confirm
            'title': title,
            'thumbnail': thumb_url,
            'uploader': user,
        }]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists"""

    # Verbose pattern: always match it with re.VERBOSE.
    # NOTE(review): the grouping and alternation lines were restored around
    # the two named alternatives - confirm.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to the talk or the playlist extraction; either result is wrapped in a list."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = match.group('playlist_id')
        name = match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        item_re = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        link_re = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        items = re.finditer(item_re, page, re.VERBOSE)
        links = re.finditer(link_re, page)

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            page, 'playlist title')

        # Items and links are paired positionally; only the link is used to
        # build the entry, so the shorter of the two streams bounds the result.
        entries = [
            self.url_result('http://www.ted.com%s' % link.group('talk_url'), 'TED')
            for _item, link in zip(items, links)
        ]
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        page = self._download_webpage(url, video_id, 'Downloading "%s" page' % talk_name)
        self.report_extraction(talk_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>', page, 'title')
        details = json.loads(self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>', page, 'json data'))
        description = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            page, 'description', flags = re.DOTALL)
        thumb = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', page, 'thumbnail')

        return {
            'id': details['id'],
            # The last entry of htmlStreams is the one that is downloaded.
            'url': details['htmlStreams'][-1]['file'],
            'ext': 'mp4',  # NOTE(review): container assumed - confirm
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }
1131 class MySpassIE(InfoExtractor):
1132 _VALID_URL = r'http://www.myspass.de/.*'
1134 def _real_extract(self, url):
1135 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1137 # video id is the last path element of the URL
1138 # usually there is a trailing slash, so also try the second but last
1139 url_path = compat_urllib_parse_urlparse(url).path
1140 url_parent_path, video_id = os.path.split(url_path)
1142 _, video_id = os.path.split(url_parent_path)
1145 metadata_url = META_DATA_URL_TEMPLATE % video_id
1146 metadata_text = self._download_webpage(metadata_url, video_id)
1147 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1149 # extract values from metadata
1150 url_flv_el = metadata.find('url_flv')
1151 if url_flv_el is None:
1152 raise ExtractorError(u'Unable to extract download url')
1153 video_url = url_flv_el.text
1154 extension = os.path.splitext(video_url)[1][1:]
1155 title_el = metadata.find('title')
1156 if title_el is None:
1157 raise ExtractorError(u'Unable to extract title')
1158 title = title_el.text
1159 format_id_el = metadata.find('format_id')
1160 if format_id_el is None:
1163 format = format_id_el.text
1164 description_el = metadata.find('description')
1165 if description_el is not None:
1166 description = description_el.text
1169 imagePreview_el = metadata.find('imagePreview')
1170 if imagePreview_el is not None:
1171 thumbnail = imagePreview_el.text
1180 'thumbnail': thumbnail,
1181 'description': description
# Extractor for spiegel.de video pages.
#
# _real_extract(): the numeric 'videoID' group of _VALID_URL identifies the
# clip.  The page is downloaded for the title (the "module-title" div) and
# http://video2.spiegel.de/flash/<id>.xml for the media description.  Only the
# last child of the XML root is used: its <filename> gives the file name
# (appended to http://video2.spiegel.de/flash/) and the extension (the text
# after the last '.'), and its <duration> is converted to float.
# NOTE(review): some short lines of this method are omitted from the listing;
# only the visible statements are described.
1185 class SpiegelIE(InfoExtractor):
1186 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1188 def _real_extract(self, url):
1189 m = re.match(self._VALID_URL, url)
1190 video_id = m.group('videoID')
1192 webpage = self._download_webpage(url, video_id)
1194 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1197 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1198 xml_code = self._download_webpage(xml_url, video_id,
1199 note=u'Downloading XML', errnote=u'Failed to download XML')
1201 idoc = xml.etree.ElementTree.fromstring(xml_code)
1202 last_type = idoc[-1]
1203 filename = last_type.findall('./filename')[0].text
1204 duration = float(last_type.findall('./duration')[0].text)
1206 video_url = 'http://video2.spiegel.de/flash/' + filename
1207 video_ext = filename.rpartition('.')[2]
1212 'title': video_title,
1213 'duration': duration,
1217 class LiveLeakIE(InfoExtractor):
1219 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1220 IE_NAME = u'liveleak'
1222 def _real_extract(self, url):
1223 mobj = re.match(self._VALID_URL, url)
1225 raise ExtractorError(u'Invalid URL: %s' % url)
1227 video_id = mobj.group('video_id')
1229 webpage = self._download_webpage(url, video_id)
1231 video_url = self._search_regex(r'file: "(.*?)",',
1232 webpage, u'video URL')
1234 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1235 webpage, u'title').replace('LiveLeak.com -', '').strip()
1237 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1238 webpage, u'description', fatal=False)
1240 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1241 webpage, u'uploader', fatal=False)
1247 'title': video_title,
1248 'description': video_description,
1249 'uploader': video_uploader
1256 class TumblrIE(InfoExtractor):
1257 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1259 def _real_extract(self, url):
1260 m_url = re.match(self._VALID_URL, url)
1261 video_id = m_url.group('id')
1262 blog = m_url.group('blog_name')
1264 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1265 webpage = self._download_webpage(url, video_id)
1267 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1268 video = re.search(re_video, webpage)
1270 raise ExtractorError(u'Unable to extract video')
1271 video_url = video.group('video_url')
1272 ext = video.group('ext')
1274 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1275 webpage, u'thumbnail', fatal=False) # We pick the first poster
1276 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1278 # The only place where you can get a title, it's not complete,
1279 # but searching in other places doesn't work for all videos
1280 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1281 webpage, u'title', flags=re.DOTALL)
1283 return [{'id': video_id,
1285 'title': video_title,
1286 'thumbnail': video_thumbnail,
# Extractor for free Bandcamp tracks (<name>.bandcamp.com/track/<title>).
#
# _real_extract() steps, as far as they are visible here:
#   1. download the track page; ExtractorError(u'No free songs found') is
#      raised when it has no 'freeDownloadPage: "..."' entry;
#   2. read the numeric id from the 'var TralbumData = {' block;
#   3. download the free-download page and json-decode the first element of
#      its 'items: ...' list;
#   4. take the 'mp3-320' download entry, split its URL into server / fsig /
#      ts and rebuild it as a /statdownload/track request;
#   5. read the final URL from the "retry_url" field of that response.
# The result dict uses the id plus 'title', 'thumb_url' and 'artist' of the
# decoded item.
# NOTE(review): some short lines of this method are omitted from the listing.
1290 class BandcampIE(InfoExtractor):
1291 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1293 def _real_extract(self, url):
1294 mobj = re.match(self._VALID_URL, url)
1295 title = mobj.group('title')
1296 webpage = self._download_webpage(url, title)
1297 # We get the link to the free download page
1298 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1299 if m_download is None:
1300 raise ExtractorError(u'No free songs found')
1302 download_link = m_download.group(1)
1303 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1304 webpage, re.MULTILINE|re.DOTALL).group('id')
1306 download_webpage = self._download_webpage(download_link, id,
1307 'Downloading free downloads page')
1308 # We get the dictionary of the track from some javascrip code
1309 info = re.search(r'items: (.*?),$',
1310 download_webpage, re.MULTILINE).group(1)
1311 info = json.loads(info)[0]
1312 # We pick mp3-320 for now, until format selection can be easily implemented.
1313 mp3_info = info[u'downloads'][u'mp3-320']
1314 # If we try to use this url it says the link has expired
1315 initial_url = mp3_info[u'url']
1316 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1317 m_url = re.match(re_url, initial_url)
1318 #We build the url we will use to get the final track url
1319 # This url is build in Bandcamp in the script download_bunde_*.js
1320 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1321 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1322 # If we could correctly generate the .rand field the url would be
1323 #in the "download_url" key
1324 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1326 track_info = {'id':id,
1327 'title' : info[u'title'],
1330 'thumbnail' : info[u'thumb_url'],
1331 'uploader' : info[u'artist']
# Extractor for redtube.com/<numeric id> pages.
#
# _real_extract(): ExtractorError(u'Invalid URL: ...') is raised on the path
# where the URL does not match (the guard line is omitted from this listing).
# The page is downloaded, the media URL is taken from the
# <source ... type="video/mp4"> tag and the title from the
# "videoTitle slidePanelMovable" <h1>.  The extension is fixed to 'mp4'.
1336 class RedTubeIE(InfoExtractor):
1337 """Information Extractor for redtube"""
1338 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1340 def _real_extract(self,url):
1341 mobj = re.match(self._VALID_URL, url)
1343 raise ExtractorError(u'Invalid URL: %s' % url)
1345 video_id = mobj.group('id')
1346 video_extension = 'mp4'
1347 webpage = self._download_webpage(url, video_id)
1349 self.report_extraction(video_id)
1351 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1352 webpage, u'video URL')
1354 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1360 'ext': video_extension,
1361 'title': video_title,
1364 class InaIE(InfoExtractor):
1365 """Information Extractor for Ina.fr"""
1366 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1368 def _real_extract(self,url):
1369 mobj = re.match(self._VALID_URL, url)
1371 video_id = mobj.group('id')
1372 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1373 video_extension = 'mp4'
1374 webpage = self._download_webpage(mrss_url, video_id)
1376 self.report_extraction(video_id)
1378 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1379 webpage, u'video URL')
1381 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1387 'ext': video_extension,
1388 'title': video_title,
# Extractor for howcast.com/videos/<id> pages.
#
# _real_extract(): rebuilds http://www.howcast.com/videos/<id>, downloads it and
# reads the mobile-media .mp4 URL from the player's file entry, the title from
# the og:title meta tag and, with fatal=False, the description and the
# og:image thumbnail meta tags.  The meta patterns expect content= to come
# before property= / name= and accept either quote style for the title and
# description values.
# NOTE(review): some short lines of this method are omitted from the listing.
1391 class HowcastIE(InfoExtractor):
1392 """Information Extractor for Howcast.com"""
1393 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1395 def _real_extract(self, url):
1396 mobj = re.match(self._VALID_URL, url)
1398 video_id = mobj.group('id')
1399 webpage_url = 'http://www.howcast.com/videos/' + video_id
1400 webpage = self._download_webpage(webpage_url, video_id)
1402 self.report_extraction(video_id)
1404 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1405 webpage, u'video URL')
1407 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1410 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1411 webpage, u'description', fatal=False)
1413 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1414 webpage, u'thumbnail', fatal=False)
1420 'title': video_title,
1421 'description': video_description,
1422 'thumbnail': thumbnail,
# Extractor for vine.co/v/<id> pages.
#
# _real_extract(): downloads https://vine.co/v/<id>.  The media URL comes from
# the twitter:player:stream meta tag, the title from og:title, the thumbnail
# from og:image (the pattern puts an optional "?..." suffix into a separate
# group) and the uploader from the <h2> inside the "user" div.  Thumbnail and
# uploader are looked up with fatal=False.
# NOTE(review): some short lines of this method are omitted from the listing.
1425 class VineIE(InfoExtractor):
1426 """Information Extractor for Vine.co"""
1427 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1429 def _real_extract(self, url):
1430 mobj = re.match(self._VALID_URL, url)
1432 video_id = mobj.group('id')
1433 webpage_url = 'https://vine.co/v/' + video_id
1434 webpage = self._download_webpage(webpage_url, video_id)
1436 self.report_extraction(video_id)
1438 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1439 webpage, u'video URL')
1441 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1444 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1445 webpage, u'thumbnail', fatal=False)
1447 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1448 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1454 'title': video_title,
1455 'thumbnail': thumbnail,
1456 'uploader': uploader,
# Extractor for Flickr video pages (flickr.com/photos/<uploader_id>/<id>).
#
# _real_extract() resolves the stream in three requests:
#   1. the photo page, for the photo_secret value;
#   2. video_mtl_xml.gne (photo id + secret), for the node id;
#   3. video_playlist.gne (node id + secret), whose <STREAM APP=... FULLPATH=...>
#      attributes are concatenated (FULLPATH HTML-unescaped) into the media URL.
# ExtractorError(u'Unable to extract video url') covers the path where the
# STREAM element is not found (its guard line is omitted from this listing).
# Title, description and thumbnail come from the og: meta tags of the photo
# page; description and thumbnail are looked up with fatal=False.
1459 class FlickrIE(InfoExtractor):
1460 """Information Extractor for Flickr videos"""
1461 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1463 def _real_extract(self, url):
1464 mobj = re.match(self._VALID_URL, url)
1466 video_id = mobj.group('id')
1467 video_uploader_id = mobj.group('uploader_id')
1468 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1469 webpage = self._download_webpage(webpage_url, video_id)
1471 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1473 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1474 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1476 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1477 first_xml, u'node_id')
1479 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1480 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1482 self.report_extraction(video_id)
1484 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1486 raise ExtractorError(u'Unable to extract video url')
1487 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1489 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1490 webpage, u'video title')
1492 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1493 webpage, u'description', fatal=False)
1495 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1496 webpage, u'thumbnail', fatal=False)
1502 'title': video_title,
1503 'description': video_description,
1504 'thumbnail': thumbnail,
1505 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com/video/<url_title> pages.
#
# _real_extract(): ExtractorError(u'Invalid URL: ...') is raised on the path
# where the URL does not match (the guard line is omitted from this listing).
# The numeric video id is read from the page's
# <article class="video" data-id="...">; title, thumbnail and description come
# from the og: meta tags (the last two with fatal=False).  After
# http://teamcoco.com/cvp/2.0/<id>.xml has been downloaded, a
# <file type="high"> element is searched for to obtain the media URL.
1508 class TeamcocoIE(InfoExtractor):
1509 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1511 def _real_extract(self, url):
1512 mobj = re.match(self._VALID_URL, url)
1514 raise ExtractorError(u'Invalid URL: %s' % url)
1515 url_title = mobj.group('url_title')
1516 webpage = self._download_webpage(url, url_title)
1518 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1519 webpage, u'video id')
1521 self.report_extraction(video_id)
1523 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1526 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1527 webpage, u'thumbnail', fatal=False)
1529 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1530 webpage, u'description', fatal=False)
1532 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1533 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1535 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1542 'title': video_title,
1543 'thumbnail': thumbnail,
1544 'description': video_description,
1547 class XHamsterIE(InfoExtractor):
1548 """Information Extractor for xHamster"""
1549 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1551 def _real_extract(self,url):
1552 mobj = re.match(self._VALID_URL, url)
1554 video_id = mobj.group('id')
1555 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1556 webpage = self._download_webpage(mrss_url, video_id)
1558 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1560 raise ExtractorError(u'Unable to extract media URL')
1561 if len(mobj.group('server')) == 0:
1562 video_url = compat_urllib_parse.unquote(mobj.group('file'))
1564 video_url = mobj.group('server')+'/key='+mobj.group('file')
1565 video_extension = video_url.split('.')[-1]
1567 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1570 # Can't see the description anywhere in the UI
1571 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1572 # webpage, u'description', fatal=False)
1573 # if video_description: video_description = unescapeHTML(video_description)
1575 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1577 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1579 video_upload_date = None
1580 self._downloader.report_warning(u'Unable to extract upload date')
1582 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1583 webpage, u'uploader id', default=u'anonymous')
1585 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1586 webpage, u'thumbnail', fatal=False)
1591 'ext': video_extension,
1592 'title': video_title,
1593 # 'description': video_description,
1594 'upload_date': video_upload_date,
1595 'uploader_id': video_uploader_id,
1596 'thumbnail': video_thumbnail
# Extractor for hypem.com/track/<id>/<name> pages.
#
# _real_extract(): the page is requested with extra 'ax' and 'ts' (current
# time) query parameters and the Set-Cookie header of the response is kept.
# The first entry of the 'tracks' list in the "displayList-data" JSON script
# supplies id, artist and song.  http://hypem.com/serve/source/<id>/<key> is
# then requested with the saved cookie and a JSON content type, and the 'url'
# field of its JSON answer is the final media URL.
# NOTE(review): 'key' is used when building serve_url, but the line assigning
# it is among the short lines omitted from this listing.  The lines guarding
# the ExtractorError raises are not visible either -- confirm in the file.
1599 class HypemIE(InfoExtractor):
1600 """Information Extractor for hypem"""
1601 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1603 def _real_extract(self, url):
1604 mobj = re.match(self._VALID_URL, url)
1606 raise ExtractorError(u'Invalid URL: %s' % url)
1607 track_id = mobj.group(1)
1609 data = { 'ax': 1, 'ts': time.time() }
1610 data_encoded = compat_urllib_parse.urlencode(data)
1611 complete_url = url + "?" + data_encoded
1612 request = compat_urllib_request.Request(complete_url)
1613 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1614 cookie = urlh.headers.get('Set-Cookie', '')
1616 self.report_extraction(track_id)
1618 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1619 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1621 track_list = json.loads(html_tracks)
1622 track = track_list[u'tracks'][0]
1624 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1627 track_id = track[u"id"]
1628 artist = track[u"artist"]
1629 title = track[u"song"]
1631 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1632 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
1633 request.add_header('cookie', cookie)
1634 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1636 song_data = json.loads(song_data_json)
1638 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1639 final_url = song_data[u"url"]
1649 class Vbox7IE(InfoExtractor):
1650 """Information Extractor for Vbox7"""
1651 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1653 def _real_extract(self,url):
1654 mobj = re.match(self._VALID_URL, url)
1656 raise ExtractorError(u'Invalid URL: %s' % url)
1657 video_id = mobj.group(1)
1659 redirect_page, urlh = self._download_webpage_handle(url, video_id)
1660 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1661 redirect_url = urlh.geturl() + new_location
1662 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1664 title = self._html_search_regex(r'<title>(.*)</title>',
1665 webpage, u'title').split('/')[0].strip()
1668 info_url = "http://vbox7.com/play/magare.do"
1669 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1670 info_request = compat_urllib_request.Request(info_url, data)
1671 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1672 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1673 if info_response is None:
1674 raise ExtractorError(u'Unable to extract the media url')
1675 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1682 'thumbnail': thumbnail_url,
# Per its docstring, returns a list holding one instance of every supported
# extractor; the order is significant because the first extractor that matches
# a URL is the one that handles it.
# NOTE(review): only three entries of the list are visible in this listing
# (YoutubePlaylistIE, StanfordOpenClassroomIE, WorldStarHipHopIE); the other
# entries and the return statement are among the omitted lines, so the full
# ordering has to be checked in the file itself.
1686 def gen_extractors():
1687 """ Return a list of an instance of every supported extractor.
1688 The order does matter; the first extractor matched is the one handling the URL.
1691 YoutubePlaylistIE(),
1716 StanfordOpenClassroomIE(),
1726 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    'IE' is appended to ie_name and the resulting name is fetched from this
    module's global namespace; a name that is not defined there raises
    KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]