Merge remote-tracking branch 'jaimeMF/yt-toplists'

author Philipp Hagemeister <phihag@phihag.de>
Mon, 9 Dec 2013 03:49:32 +0000 (04:49 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 9 Dec 2013 03:49:32 +0000 (04:49 +0100)
diff --combined youtube_dl/extractor/__init__.py

index 2b78cc84dc09e5d67f609d9c29ccf48d71be3e5b,0abf86e44ca2590eeb0899eb7bec540a5b5320c0..3f740baa13ff8c2c5f6891cc32042ed14b10188c
--- 1/youtube_dl/extractor/__init__.py
--- 2/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@@ -8,7 -8,6 +8,7 @@@ from .arte import 
       ArteTVPlus7IE,
       ArteTVCreativeIE,
       ArteTVFutureIE,
+ +    ArteTVDDCIE,
   )
   from .auengine import AUEngineIE
   from .bambuser import BambuserIE, BambuserChannelIE
@@@ -57,7 -56,7 +57,7 @@@ from .flickr import FlickrI
   from .francetv import (
       PluzzIE,
       FranceTvInfoIE,
- -    France2IE,
+ +    FranceTVIE,
       GenerationQuoiIE
   )
   from .freesound import FreesoundIE
@@@ -103,7 -102,6 +103,7 @@@ from .nbc import NBCNewsI
   from .newgrounds import NewgroundsIE
   from .nhl import NHLIE, NHLVideocenterIE
   from .niconico import NiconicoIE
+ +from .ninegag import NineGagIE
   from .nowvideo import NowVideoIE
   from .ooyala import OoyalaIE
   from .orf import ORFIE
@@@ -112,7 -110,6 +112,7 @@@ from .photobucket import PhotobucketI
   from .podomatic import PodomaticIE
   from .pornhub import PornHubIE
   from .pornotube import PornotubeIE
+ +from .pyvideo import PyvideoIE
   from .rbmaradio import RBMARadioIE
   from .redtube import RedTubeIE
   from .ringtv import RingTVIE
@@@ -124,12 -121,6 +124,12 @@@ from .rutube import RutubeI
   from .sina import SinaIE
   from .slashdot import SlashdotIE
   from .slideshare import SlideshareIE
+ +from .smotri import (
+ +    SmotriIE,
+ +    SmotriCommunityIE,
+ +    SmotriUserIE,
+ +    SmotriBroadcastIE,
+ +)
   from .sohu import SohuIE
   from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
   from .southparkstudios import (
@@@ -148,7 -139,6 +148,7 @@@ from .teamcoco import TeamcocoI
   from .techtalks import TechTalksIE
   from .ted import TEDIE
   from .tf1 import TF1IE
+ +from .theplatform import ThePlatformIE
   from .thisav import ThisAVIE
   from .toutv import TouTvIE
   from .traileraddict import TrailerAddictIE
@@@ -169,13 -159,7 +169,13 @@@ from .viddler import ViddlerI
   from .videodetective import VideoDetectiveIE
   from .videofyme import VideofyMeIE
   from .videopremium import VideoPremiumIE
- -from .vimeo import VimeoIE, VimeoChannelIE
+ +from .vimeo import (
+ +    VimeoIE,
+ +    VimeoChannelIE,
+ +    VimeoUserIE,
+ +    VimeoAlbumIE,
+ +    VimeoGroupsIE,
+ +)
   from .vine import VineIE
   from .viki import VikiIE
   from .vk import VKIE
@@@ -183,7 -167,6 +183,7 @@@ from .wat import WatI
   from .websurg import WeBSurgIE
   from .weibo import WeiboIE
   from .wimp import WimpIE
+ +from .wistia import WistiaIE
   from .worldstarhiphop import WorldStarHipHopIE
   from .xhamster import XHamsterIE
   from .xnxx import XNXXIE
@@@ -211,6 -194,7 +211,7 @@@ from .youtube import 
       YoutubeWatchLaterIE,
       YoutubeFavouritesIE,
       YoutubeHistoryIE,
+     YoutubeTopListIE,
   )
   from .zdf import ZDFIE
   
diff --combined youtube_dl/extractor/youtube.py

index 7f7508c74f1b98de4d93ee51472d0b32f9184e98,a1a4d896debdf8fd7c38fa38d3ad95023e12302e..874429b78cc4917ca1cbbec7245c85436dd73783
--- 1/youtube_dl/extractor/youtube.py
--- 2/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@@ -7,6 -7,7 +7,6 @@@ import itertool
   import json
   import os.path
   import re
- -import socket
   import string
   import struct
   import traceback
@@@ -16,7 -17,9 +16,7 @@@ from .common import InfoExtractor, Sear
   from .subtitles import SubtitlesInfoExtractor
   from ..utils import (
       compat_chr,
- -    compat_http_client,
       compat_parse_qs,
- -    compat_urllib_error,
       compat_urllib_parse,
       compat_urllib_request,
       compat_urlparse,
@@@ -42,11 -45,19 +42,11 @@@ class YoutubeBaseInfoExtractor(InfoExtr
       # If True it will raise an error if no login info is provided
       _LOGIN_REQUIRED = False
   
- -    def report_lang(self):
- -        """Report attempt to set language."""
- -        self.to_screen(u'Setting language')
- -
       def _set_language(self):
- -        request = compat_urllib_request.Request(self._LANG_URL)
- -        try:
- -            self.report_lang()
- -            compat_urllib_request.urlopen(request).read()
- -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- -            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
- -            return False
- -        return True
+ +        return bool(self._download_webpage(
+ +            self._LANG_URL, None,
+ +            note=u'Setting language', errnote='unable to set language',
+ +            fatal=False))
   
       def _login(self):
           (username, password) = self._get_login_info()
@@@ -56,12 -67,12 +56,12 @@@
                   raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
               return False
   
- -        request = compat_urllib_request.Request(self._LOGIN_URL)
- -        try:
- -            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- -            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
- -            return False
+ +        login_page = self._download_webpage(
+ +            self._LOGIN_URL, None,
+ +            note=u'Downloading login page',
+ +            errnote=u'unable to fetch login page', fatal=False)
+ +        if login_page is False:
+ +            return
   
           galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                     login_page, u'Login GALX parameter')
@@@ -91,28 -102,29 +91,28 @@@
           # chokes on unicode
           login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
           login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
- -        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
- -        try:
- -            self.report_login()
- -            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
- -            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
- -                self._downloader.report_warning(u'unable to log in: bad username or password')
- -                return False
- -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- -            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+ +
+ +        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+ +        login_results = self._download_webpage(
+ +            req, None,
+ +            note=u'Logging in', errnote=u'unable to log in', fatal=False)
+ +        if login_results is False:
+ +            return False
+ +        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
+ +            self._downloader.report_warning(u'unable to log in: bad username or password')
               return False
           return True
   
       def _confirm_age(self):
           age_form = {
- -                'next_url':     '/',
- -                'action_confirm':   'Confirm',
- -                }
- -        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
- -        try:
- -            self.report_age_confirmation()
- -            compat_urllib_request.urlopen(request).read().decode('utf-8')
- -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- -            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+ +            'next_url': '/',
+ +            'action_confirm': 'Confirm',
+ +        }
+ +        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+ +
+ +        self._download_webpage(
+ +            req, None,
+ +            note=u'Confirming age', errnote=u'Unable to confirm age')
           return True
   
       def _real_initialize(self):
@@@ -324,7 -336,7 +324,7 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
                   u"uploader": u"Philipp Hagemeister",
                   u"uploader_id": u"phihag",
                   u"upload_date": u"20121002",
- -                u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+ +                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
               }
           },
           {
@@@ -376,6 -388,10 +376,6 @@@
           super(YoutubeIE, self).__init__(*args, **kwargs)
           self._player_cache = {}
   
- -    def report_video_webpage_download(self, video_id):
- -        """Report attempt to download video webpage."""
- -        self.to_screen(u'%s: Downloading video webpage' % video_id)
- -
       def report_video_info_webpage_download(self, video_id):
           """Report attempt to download video info webpage."""
           self.to_screen(u'%s: Downloading video info webpage' % video_id)
@@@ -1242,8 -1258,15 +1242,8 @@@
           video_id = self._extract_id(url)
   
           # Get video webpage
- -        self.report_video_webpage_download(video_id)
           url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
- -        request = compat_urllib_request.Request(url)
- -        try:
- -            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
- -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- -            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
- -
- -        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
+ +        video_webpage = self._download_webpage(url, video_id)
   
           # Attempt to extract SWF player URL
           mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
@@@ -1343,15 -1366,6 +1343,15 @@@
           # description
           video_description = get_element_by_id("eow-description", video_webpage)
           if video_description:
+ +            video_description = re.sub(r'''(?x)
+ +                <a\s+
+ +                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ +                    title="([^"]+)"\s+
+ +                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ +                    class="yt-uix-redirect-link"\s*>
+ +                [^<]+
+ +                </a>
+ +            ''', r'\1', video_description)
               video_description = clean_html(video_description)
           else:
               fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
@@@ -1360,14 -1374,6 +1360,14 @@@
               else:
                   video_description = u''
   
+ +        def _extract_count(klass):
+ +            count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False)
+ +            if count is not None:
+ +                return int(count.replace(',', ''))
+ +            return None
+ +        like_count = _extract_count(u'likes-count')
+ +        dislike_count = _extract_count(u'dislikes-count')
+ +
           # subtitles
           video_subtitles = self.extract_subtitles(video_id, video_webpage)
   
@@@ -1500,8 -1506,6 +1500,8 @@@
                   'annotations':  video_annotations,
                   'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
                   'view_count': view_count,
+ +                'like_count': like_count,
+ +                'dislike_count': dislike_count,
               })
           return results
   
@@@ -1516,10 -1520,10 +1516,10 @@@ class YoutubePlaylistIE(YoutubeBaseInfo
                              \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                           )
- -                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
+ +                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                           .*
                        |
- -                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
+ +                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                        )"""
       _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
       _MORE_PAGES_INDICATOR = r'data-link-type="next"'
@@@ -1541,7 -1545,7 +1541,7 @@@
       def _extract_mix(self, playlist_id):
           # The mixes are generated from a a single video
           # the id of the playlist is just 'RD' + video_id
- -        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
+ +        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
           webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
           title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
               get_element_by_attribute('class', 'title ', webpage))
@@@ -1569,9 -1573,12 +1569,12 @@@
               else:
                   self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
   
- -        if len(playlist_id) == 13:  # 'RD' + 11 characters for the video id
+ +        if playlist_id.startswith('RD'):
               # Mixes require a custom extraction process
               return self._extract_mix(playlist_id)
+         if playlist_id.startswith('TL'):
+             raise ExtractorError(u'For downloading YouTube.com top lists, use '
+                 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
   
           # Extract the video ids from the playlist pages
           ids = []
@@@ -1594,6 -1601,38 +1597,38 @@@
           return self.playlist_result(url_results, playlist_id, playlist_title)
   
   
+ class YoutubeTopListIE(YoutubePlaylistIE):
+     IE_NAME = u'youtube:toplist'
+     IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
+         u' (Example: "yttoplist:music:Top Tracks")')
+     _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
+ 
+     def _real_extract(self, url):
+         mobj = re.match(self._VALID_URL, url)
+         channel = mobj.group('chann')
+         title = mobj.group('title')
+         query = compat_urllib_parse.urlencode({'title': title})
+         playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
+         channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
+         link = self._html_search_regex(playlist_re, channel_page, u'list')
+         url = compat_urlparse.urljoin('https://www.youtube.com/', link)
+         
+         video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
+         ids = []
+         # sometimes the webpage doesn't contain the videos
+         # retry until we get them
+         for i in itertools.count(0):
+             msg = u'Downloading Youtube mix'
+             if i > 0:
+                 msg += ', retry #%d' % i
+             webpage = self._download_webpage(url, title, msg)
+             ids = orderedSet(re.findall(video_re, webpage))
+             if ids:
+                 break
+         url_results = self._ids_to_results(ids)
+         return self.playlist_result(url_results, playlist_title=title)
+ 
+ 
   class YoutubeChannelIE(InfoExtractor):
       IE_DESC = u'YouTube.com channels'
       _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
@@@ -1619,11 -1658,10 +1654,11 @@@
           video_ids = []
           url = 'https://www.youtube.com/channel/%s/videos' % channel_id
           channel_page = self._download_webpage(url, channel_id)
- -        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
- -            autogenerated = True
- -        else:
- -            autogenerated = False
+ +        autogenerated = re.search(r'''(?x)
+ +                class="[^"]*?(?:
+ +                    channel-header-autogenerated-label|
+ +                    yt-channel-title-autogenerated
+ +                )[^"]*"''', channel_page) is not None
   
           if autogenerated:
               # The videos are contained in a single page
@@@ -1725,6 -1763,10 +1760,6 @@@ class YoutubeSearchIE(SearchInfoExtract
       IE_NAME = u'youtube:search'
       _SEARCH_KEY = 'ytsearch'
   
- -    def report_download_page(self, query, pagenum):
- -        """Report attempt to download search page with given number."""
- -        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
- -
       def _get_n_results(self, query, n):
           """Get a specified number of results for a query"""
   
@@@ -1733,15 -1775,16 +1768,15 @@@
           limit = n
   
           while (50 * pagenum) < limit:
- -            self.report_download_page(query, pagenum+1)
               result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
- -            request = compat_urllib_request.Request(result_url)
- -            try:
- -                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
- -            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- -                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
- -            api_response = json.loads(data)['data']
- -
- -            if not 'items' in api_response:
+ +            data_json = self._download_webpage(
+ +                result_url, video_id=u'query "%s"' % query,
+ +                note=u'Downloading page %s' % (pagenum + 1),
+ +                errnote=u'Unable to download API page')
+ +            data = json.loads(data_json)
+ +            api_response = data['data']
+ +
+ +            if 'items' not in api_response:
                   raise ExtractorError(u'[youtube] No video results')
   
               new_ids = list(video['id'] for video in api_response['items'])
@@@ -1757,7 -1800,6 +1792,7 @@@
           return self.playlist_result(videos, query)
   
   class YoutubeSearchDateIE(YoutubeSearchIE):
+ +    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
       _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
       _SEARCH_KEY = 'ytsearchdate'
       IE_DESC = u'YouTube.com searches, newest videos first'
author	Philipp Hagemeister <phihag@phihag.de>
	Mon, 9 Dec 2013 03:49:32 +0000 (04:49 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Mon, 9 Dec 2013 03:49:32 +0000 (04:49 +0100)
		1	2
youtube_dl/extractor/__init__.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/extractor/youtube.py	patch \|	diff1 \|	diff2 \|	blob \| history