Merge remote-tracking branch 'jaimeMF/yt-toplists'
author Philipp Hagemeister <phihag@phihag.de>
Mon, 9 Dec 2013 03:49:32 +0000 (04:49 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 9 Dec 2013 03:49:32 +0000 (04:49 +0100)
1  2 
youtube_dl/extractor/__init__.py
youtube_dl/extractor/youtube.py

index 2b78cc84dc09e5d67f609d9c29ccf48d71be3e5b,0abf86e44ca2590eeb0899eb7bec540a5b5320c0..3f740baa13ff8c2c5f6891cc32042ed14b10188c
@@@ -8,7 -8,6 +8,7 @@@ from .arte import 
      ArteTVPlus7IE,
      ArteTVCreativeIE,
      ArteTVFutureIE,
 +    ArteTVDDCIE,
  )
  from .auengine import AUEngineIE
  from .bambuser import BambuserIE, BambuserChannelIE
@@@ -57,7 -56,7 +57,7 @@@ from .flickr import FlickrI
  from .francetv import (
      PluzzIE,
      FranceTvInfoIE,
 -    France2IE,
 +    FranceTVIE,
      GenerationQuoiIE
  )
  from .freesound import FreesoundIE
@@@ -103,7 -102,6 +103,7 @@@ from .nbc import NBCNewsI
  from .newgrounds import NewgroundsIE
  from .nhl import NHLIE, NHLVideocenterIE
  from .niconico import NiconicoIE
 +from .ninegag import NineGagIE
  from .nowvideo import NowVideoIE
  from .ooyala import OoyalaIE
  from .orf import ORFIE
@@@ -112,7 -110,6 +112,7 @@@ from .photobucket import PhotobucketI
  from .podomatic import PodomaticIE
  from .pornhub import PornHubIE
  from .pornotube import PornotubeIE
 +from .pyvideo import PyvideoIE
  from .rbmaradio import RBMARadioIE
  from .redtube import RedTubeIE
  from .ringtv import RingTVIE
@@@ -124,12 -121,6 +124,12 @@@ from .rutube import RutubeI
  from .sina import SinaIE
  from .slashdot import SlashdotIE
  from .slideshare import SlideshareIE
 +from .smotri import (
 +    SmotriIE,
 +    SmotriCommunityIE,
 +    SmotriUserIE,
 +    SmotriBroadcastIE,
 +)
  from .sohu import SohuIE
  from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
  from .southparkstudios import (
@@@ -148,7 -139,6 +148,7 @@@ from .teamcoco import TeamcocoI
  from .techtalks import TechTalksIE
  from .ted import TEDIE
  from .tf1 import TF1IE
 +from .theplatform import ThePlatformIE
  from .thisav import ThisAVIE
  from .toutv import TouTvIE
  from .traileraddict import TrailerAddictIE
@@@ -169,13 -159,7 +169,13 @@@ from .viddler import ViddlerI
  from .videodetective import VideoDetectiveIE
  from .videofyme import VideofyMeIE
  from .videopremium import VideoPremiumIE
 -from .vimeo import VimeoIE, VimeoChannelIE
 +from .vimeo import (
 +    VimeoIE,
 +    VimeoChannelIE,
 +    VimeoUserIE,
 +    VimeoAlbumIE,
 +    VimeoGroupsIE,
 +)
  from .vine import VineIE
  from .viki import VikiIE
  from .vk import VKIE
@@@ -183,7 -167,6 +183,7 @@@ from .wat import WatI
  from .websurg import WeBSurgIE
  from .weibo import WeiboIE
  from .wimp import WimpIE
 +from .wistia import WistiaIE
  from .worldstarhiphop import WorldStarHipHopIE
  from .xhamster import XHamsterIE
  from .xnxx import XNXXIE
@@@ -211,6 -194,7 +211,7 @@@ from .youtube import 
      YoutubeWatchLaterIE,
      YoutubeFavouritesIE,
      YoutubeHistoryIE,
+     YoutubeTopListIE,
  )
  from .zdf import ZDFIE
  
index 7f7508c74f1b98de4d93ee51472d0b32f9184e98,a1a4d896debdf8fd7c38fa38d3ad95023e12302e..874429b78cc4917ca1cbbec7245c85436dd73783
@@@ -7,6 -7,7 +7,6 @@@ import itertool
  import json
  import os.path
  import re
 -import socket
  import string
  import struct
  import traceback
@@@ -16,7 -17,9 +16,7 @@@ from .common import InfoExtractor, Sear
  from .subtitles import SubtitlesInfoExtractor
  from ..utils import (
      compat_chr,
 -    compat_http_client,
      compat_parse_qs,
 -    compat_urllib_error,
      compat_urllib_parse,
      compat_urllib_request,
      compat_urlparse,
@@@ -42,11 -45,19 +42,11 @@@ class YoutubeBaseInfoExtractor(InfoExtr
      # If True it will raise an error if no login info is provided
      _LOGIN_REQUIRED = False
  
 -    def report_lang(self):
 -        """Report attempt to set language."""
 -        self.to_screen(u'Setting language')
 -
      def _set_language(self):
 -        request = compat_urllib_request.Request(self._LANG_URL)
 -        try:
 -            self.report_lang()
 -            compat_urllib_request.urlopen(request).read()
 -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 -            return False
 -        return True
 +        return bool(self._download_webpage(
 +            self._LANG_URL, None,
 +            note=u'Setting language', errnote='unable to set language',
 +            fatal=False))
  
      def _login(self):
          (username, password) = self._get_login_info()
                  raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
              return False
  
 -        request = compat_urllib_request.Request(self._LOGIN_URL)
 -        try:
 -            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 -            return False
 +        login_page = self._download_webpage(
 +            self._LOGIN_URL, None,
 +            note=u'Downloading login page',
 +            errnote=u'unable to fetch login page', fatal=False)
 +        if login_page is False:
 +            return
  
          galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                    login_page, u'Login GALX parameter')
          # chokes on unicode
          login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
          login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 -        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 -        try:
 -            self.report_login()
 -            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 -            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 -                self._downloader.report_warning(u'unable to log in: bad username or password')
 -                return False
 -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 +
 +        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 +        login_results = self._download_webpage(
 +            req, None,
 +            note=u'Logging in', errnote=u'unable to log in', fatal=False)
 +        if login_results is False:
 +            return False
 +        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 +            self._downloader.report_warning(u'unable to log in: bad username or password')
              return False
          return True
  
      def _confirm_age(self):
          age_form = {
 -                'next_url':     '/',
 -                'action_confirm':   'Confirm',
 -                }
 -        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 -        try:
 -            self.report_age_confirmation()
 -            compat_urllib_request.urlopen(request).read().decode('utf-8')
 -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 +            'next_url': '/',
 +            'action_confirm': 'Confirm',
 +        }
 +        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 +
 +        self._download_webpage(
 +            req, None,
 +            note=u'Confirming age', errnote=u'Unable to confirm age')
          return True
  
      def _real_initialize(self):
@@@ -324,7 -336,7 +324,7 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
                  u"uploader": u"Philipp Hagemeister",
                  u"uploader_id": u"phihag",
                  u"upload_date": u"20121002",
 -                u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
 +                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
              }
          },
          {
          super(YoutubeIE, self).__init__(*args, **kwargs)
          self._player_cache = {}
  
 -    def report_video_webpage_download(self, video_id):
 -        """Report attempt to download video webpage."""
 -        self.to_screen(u'%s: Downloading video webpage' % video_id)
 -
      def report_video_info_webpage_download(self, video_id):
          """Report attempt to download video info webpage."""
          self.to_screen(u'%s: Downloading video info webpage' % video_id)
          video_id = self._extract_id(url)
  
          # Get video webpage
 -        self.report_video_webpage_download(video_id)
          url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 -        request = compat_urllib_request.Request(url)
 -        try:
 -            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 -
 -        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 +        video_webpage = self._download_webpage(url, video_id)
  
          # Attempt to extract SWF player URL
          mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
          # description
          video_description = get_element_by_id("eow-description", video_webpage)
          if video_description:
 +            video_description = re.sub(r'''(?x)
 +                <a\s+
 +                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
 +                    title="([^"]+)"\s+
 +                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
 +                    class="yt-uix-redirect-link"\s*>
 +                [^<]+
 +                </a>
 +            ''', r'\1', video_description)
              video_description = clean_html(video_description)
          else:
              fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
              else:
                  video_description = u''
  
 +        def _extract_count(klass):
 +            count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False)
 +            if count is not None:
 +                return int(count.replace(',', ''))
 +            return None
 +        like_count = _extract_count(u'likes-count')
 +        dislike_count = _extract_count(u'dislikes-count')
 +
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
  
                  'annotations':  video_annotations,
                  'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
                  'view_count': view_count,
 +                'like_count': like_count,
 +                'dislike_count': dislike_count,
              })
          return results
  
@@@ -1516,10 -1520,10 +1516,10 @@@ class YoutubePlaylistIE(YoutubeBaseInfo
                             \? (?:.*?&)*? (?:p|a|list)=
                          |  p/
                          )
 -                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
 +                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                          .*
                       |
 -                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
 +                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                       )"""
      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
      _MORE_PAGES_INDICATOR = r'data-link-type="next"'
      def _extract_mix(self, playlist_id):
          # The mixes are generated from a single video
          # the id of the playlist is just 'RD' + video_id
 -        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
 +        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
          webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
          title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
              get_element_by_attribute('class', 'title ', webpage))
              else:
                  self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
  
 -        if len(playlist_id) == 13:  # 'RD' + 11 characters for the video id
 +        if playlist_id.startswith('RD'):
              # Mixes require a custom extraction process
              return self._extract_mix(playlist_id)
+         if playlist_id.startswith('TL'):
+             raise ExtractorError(u'For downloading YouTube.com top lists, use '
+                 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
  
          # Extract the video ids from the playlist pages
          ids = []
          return self.playlist_result(url_results, playlist_id, playlist_title)
  
  
+ class YoutubeTopListIE(YoutubePlaylistIE):
+     """Extractor for the "yttoplist:{channel}:{list title}" pseudo-URL.
+ 
+     Downloads the channel page, finds the link whose query string carries
+     the requested list title, and collects the video ids from that page.
+     """
+     IE_NAME = u'youtube:toplist'
+     IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
+         u' (Example: "yttoplist:music:Top Tracks")')
+     _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
+     # Upper bound on re-downloads of the list page when it contains no videos
+     _MAX_RETRIES = 10
+ 
+     def _real_extract(self, url):
+         """Return a playlist result holding the videos of the requested list.
+ 
+         Raises ExtractorError if the list page still contains no video ids
+         after _MAX_RETRIES additional downloads.
+         """
+         mobj = re.match(self._VALID_URL, url)
+         channel = mobj.group('chann')
+         title = mobj.group('title')
+         # The link to the list embeds the title as an urlencoded parameter
+         query = compat_urllib_parse.urlencode({'title': title})
+         playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
+         channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
+         link = self._html_search_regex(playlist_re, channel_page, u'list')
+         url = compat_urlparse.urljoin('https://www.youtube.com/', link)
+ 
+         video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
+         ids = []
+         # sometimes the webpage doesn't contain the videos
+         # retry a bounded number of times instead of looping forever
+         for i in range(self._MAX_RETRIES + 1):
+             msg = u'Downloading Youtube top list'
+             if i > 0:
+                 msg += u', retry #%d' % i
+             webpage = self._download_webpage(url, title, msg)
+             ids = orderedSet(re.findall(video_re, webpage))
+             if ids:
+                 break
+         else:
+             # the loop ended without a break: no attempt produced any id
+             raise ExtractorError(
+                 u'Unable to find any video in the top list "%s"' % title)
+         url_results = self._ids_to_results(ids)
+         return self.playlist_result(url_results, playlist_title=title)
  class YoutubeChannelIE(InfoExtractor):
      IE_DESC = u'YouTube.com channels'
      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
          video_ids = []
          url = 'https://www.youtube.com/channel/%s/videos' % channel_id
          channel_page = self._download_webpage(url, channel_id)
 -        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
 -            autogenerated = True
 -        else:
 -            autogenerated = False
 +        autogenerated = re.search(r'''(?x)
 +                class="[^"]*?(?:
 +                    channel-header-autogenerated-label|
 +                    yt-channel-title-autogenerated
 +                )[^"]*"''', channel_page) is not None
  
          if autogenerated:
              # The videos are contained in a single page
@@@ -1725,6 -1763,10 +1760,6 @@@ class YoutubeSearchIE(SearchInfoExtract
      IE_NAME = u'youtube:search'
      _SEARCH_KEY = 'ytsearch'
  
 -    def report_download_page(self, query, pagenum):
 -        """Report attempt to download search page with given number."""
 -        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 -
      def _get_n_results(self, query, n):
          """Get a specified number of results for a query"""
  
          limit = n
  
          while (50 * pagenum) < limit:
 -            self.report_download_page(query, pagenum+1)
              result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
 -            request = compat_urllib_request.Request(result_url)
 -            try:
 -                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
 -            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
 -            api_response = json.loads(data)['data']
 -
 -            if not 'items' in api_response:
 +            data_json = self._download_webpage(
 +                result_url, video_id=u'query "%s"' % query,
 +                note=u'Downloading page %s' % (pagenum + 1),
 +                errnote=u'Unable to download API page')
 +            data = json.loads(data_json)
 +            api_response = data['data']
 +
 +            if 'items' not in api_response:
                  raise ExtractorError(u'[youtube] No video results')
  
              new_ids = list(video['id'] for video in api_response['items'])
          return self.playlist_result(videos, query)
  
  class YoutubeSearchDateIE(YoutubeSearchIE):
 +    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
      _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
      _SEARCH_KEY = 'ytsearchdate'
      IE_DESC = u'YouTube.com searches, newest videos first'