[flickr] Move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.flickr import FlickrIE
33 from .extractor.funnyordie import FunnyOrDieIE
34 from .extractor.gametrailers import GametrailersIE
35 from .extractor.generic import GenericIE
36 from .extractor.googleplus import GooglePlusIE
37 from .extractor.googlesearch import GoogleSearchIE
38 from .extractor.howcast import HowcastIE
39 from .extractor.hypem import HypemIE
40 from .extractor.ina import InaIE
41 from .extractor.infoq import InfoQIE
42 from .extractor.justintv import JustinTVIE
43 from .extractor.keek import KeekIE
44 from .extractor.liveleak import LiveLeakIE
45 from .extractor.metacafe import MetacafeIE
46 from .extractor.mixcloud import MixcloudIE
47 from .extractor.mtv import MTVIE
48 from .extractor.myspass import MySpassIE
49 from .extractor.myvideo import MyVideoIE
50 from .extractor.nba import NBAIE
51 from .extractor.statigram import StatigramIE
52 from .extractor.photobucket import PhotobucketIE
53 from .extractor.pornotube import PornotubeIE
54 from .extractor.rbmaradio import RBMARadioIE
55 from .extractor.redtube import RedTubeIE
56 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
57 from .extractor.spiegel import SpiegelIE
58 from .extractor.stanfordoc import StanfordOpenClassroomIE
59 from .extractor.steam import SteamIE
60 from .extractor.ted import TEDIE
61 from .extractor.tumblr import TumblrIE
62 from .extractor.ustream import UstreamIE
63 from .extractor.vbox7 import Vbox7IE
64 from .extractor.vimeo import VimeoIE
65 from .extractor.vine import VineIE
66 from .extractor.worldstarhiphop import WorldStarHipHopIE
67 from .extractor.xnxx import XNXXIE
68 from .extractor.xvideos import XVideosIE
69 from .extractor.yahoo import YahooIE, YahooSearchIE
70 from .extractor.youjizz import YouJizzIE
71 from .extractor.youku import YoukuIE
72 from .extractor.youporn import YouPornIE
73 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
74 from .extractor.zdf import ZDFIE
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com video pages."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract the metadata and media URL of a single teamcoco.com video.

        The video page is downloaded first to obtain the numeric video id and
        the Open Graph metadata; the id is then used to fetch an XML data
        document from which the "high" quality file URL is read.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if `url` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL slug is only used as the display id while downloading the
        # page; the real video id is read from the page markup below.
        page_slug = match.group('url_title')
        webpage = self._download_webpage(url, page_slug)

        id_pattern = r'<article class="video" data-id="(\d+?)"'
        video_id = self._html_search_regex(id_pattern, webpage, u'video id')

        self.report_extraction(video_id)

        title_pattern = r'<meta property="og:title" content="(.+?)"'
        video_title = self._html_search_regex(
            title_pattern, webpage, u'title')

        # Thumbnail and description are looked up with fatal=False, i.e. they
        # are treated as optional fields.
        thumbnail_pattern = r'<meta property="og:image" content="(.+?)"'
        thumbnail = self._html_search_regex(
            thumbnail_pattern, webpage, u'thumbnail', fatal=False)

        description_pattern = r'<meta property="og:description" content="(.*?)"'
        video_description = self._html_search_regex(
            description_pattern, webpage, u'description', fatal=False)

        # Second request: the per-video XML document listing the media files.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(
            data_url, video_id, 'Downloading data webpage')

        file_pattern = r'<file type="high".*?>(.*?)</file>'
        video_url = self._html_search_regex(file_pattern, data, u'video URL')

        info = {
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }
        return [info]
152
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that only a literal "www." prefix is
    # accepted (an unescaped "." would match any character there).
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL and metadata of a single xHamster movie.

        The numeric id is taken from `url`, the canonical movie page is
        downloaded, and the media location, title, upload date, uploader id
        and thumbnail are read from that page.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if `url` does not match _VALID_URL or if the
        media URL cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with a clear extractor error instead of an AttributeError
            # on the `mobj.group` call below.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Always fetch the page through the id-only form of the movie URL.
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # Empty server: the 'file' value is itself the (percent-encoded)
            # media URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # The upload date is assembled as YYYYMMDD from the timestamp found
        # in a hint='...' attribute; a missing date only produces a warning.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
204
205
206
207
208
def gen_extractors():
    """Build a list holding one new instance of every supported extractor.

    The order is significant: the first extractor in the list that matches a
    URL is the one that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate in declaration order so the resulting list keeps the
    # matching priority spelled out above.
    return [extractor_class() for extractor_class in extractor_classes]
278
def get_info_extractor(ie_name):
    """Return the info extractor class registered under `ie_name`.

    The suffix 'IE' is appended to `ie_name` and the resulting name is looked
    up in this module's global namespace; an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]