Merge pull request #2274 from z00nx/master
authorFilippo Valsorda <filosottile.wiki@gmail.com>
Mon, 3 Feb 2014 01:31:57 +0000 (17:31 -0800)
committerFilippo Valsorda <filosottile.wiki@gmail.com>
Mon, 3 Feb 2014 01:31:57 +0000 (17:31 -0800)
[bliptv] Filter out SRT files

24 files changed:
README.md
youtube-dl.plugin.zsh [new file with mode: 0644]
youtube_dl/extractor/__init__.py
youtube_dl/extractor/cliphunter.py
youtube_dl/extractor/collegehumor.py
youtube_dl/extractor/common.py
youtube_dl/extractor/cspan.py
youtube_dl/extractor/infoq.py
youtube_dl/extractor/lifenews.py [new file with mode: 0644]
youtube_dl/extractor/liveleak.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/newgrounds.py
youtube_dl/extractor/normalboots.py [new file with mode: 0644]
youtube_dl/extractor/rbmaradio.py
youtube_dl/extractor/ro220.py
youtube_dl/extractor/southparkstudios.py
youtube_dl/extractor/spiegel.py
youtube_dl/extractor/tinypic.py [new file with mode: 0644]
youtube_dl/extractor/traileraddict.py
youtube_dl/extractor/ustream.py
youtube_dl/extractor/vevo.py
youtube_dl/extractor/vube.py [new file with mode: 0644]
youtube_dl/extractor/xhamster.py
youtube_dl/version.py

index d795ef6f28aa8a3cbc9e7d6c21ce3bc6958aa8de..5200f59fe7b5da4bd5a51374f5eda27afdf4e756 100644 (file)
--- a/README.md
+++ b/README.md
@@ -325,7 +325,7 @@ Since June 2012 (#342) youtube-dl is packed as an executable zipfile, simply unz
 
 To run the exe you need to install first the [Microsoft Visual C++ 2008 Redistributable Package](http://www.microsoft.com/en-us/download/details.aspx?id=29).
 
-# BUILD INSTRUCTIONS
+# DEVELOPER INSTRUCTIONS
 
 Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
 
@@ -347,6 +347,10 @@ If you want to create a build of youtube-dl yourself, you'll need
 * zip
 * nosetests
 
+### Adding support for a new site
+
+If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor` and add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py TestDownload.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
+
 # BUGS
 
 Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues> . Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email.
diff --git a/youtube-dl.plugin.zsh b/youtube-dl.plugin.zsh
new file mode 100644 (file)
index 0000000..4edab52
--- /dev/null
@@ -0,0 +1,24 @@
+# This allows the youtube-dl command to be installed in ZSH using antigen.
+# Antigen is a bundle manager. It allows you to enhance the functionality of
+# your zsh session by installing bundles and themes easily.
+
+# Antigen documentation:
+# http://antigen.sharats.me/
+# https://github.com/zsh-users/antigen
+
+# Install youtube-dl:
+# antigen bundle rg3/youtube-dl
+# Bundles installed by antigen are available for use immediately.
+
+# Update youtube-dl (and all other antigen bundles):
+# antigen update
+
+# The antigen command will download the git repository to a folder and then
+# execute an enabling script (this file). The complete process for loading the
+# code is documented here:
+# https://github.com/zsh-users/antigen#notes-on-writing-plugins
+
+# This script aliases youtube-dl to the python script that this library
+# provides, with PYTHONPATH set so that the full set of code can be located.
+# ${0:A:h} is the absolute directory of this file; it is single-quoted in the
+# alias so that the alias survives `cd` and directories containing spaces.
+alias youtube-dl="PYTHONPATH='${0:A:h}' '${0:A:h}/bin/youtube-dl'"
index e89b5cf9def65965f525b1e706bb7fb691a67228..073f3a0d145cd76f358ec43fb08761d0977694ee 100644 (file)
@@ -110,6 +110,7 @@ from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
 from .keek import KeekIE
 from .la7 import LA7IE
+from .lifenews import LifeNewsIE
 from .liveleak import LiveLeakIE
 from .livestream import LivestreamIE, LivestreamOriginalIE
 from .lynda import (
@@ -141,6 +142,7 @@ from .newgrounds import NewgroundsIE
 from .nhl import NHLIE, NHLVideocenterIE
 from .niconico import NiconicoIE
 from .ninegag import NineGagIE
+from .normalboots import NormalbootsIE
 from .novamov import NovamovIE
 from .nowness import NownessIE
 from .nowvideo import NowVideoIE
@@ -198,6 +200,7 @@ from .ted import TEDIE
 from .tf1 import TF1IE
 from .theplatform import ThePlatformIE
 from .thisav import ThisAVIE
+from .tinypic import TinyPicIE
 from .toutv import TouTvIE
 from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
@@ -228,6 +231,7 @@ from .vimeo import (
 from .vine import VineIE
 from .viki import VikiIE
 from .vk import VKIE
+from .vube import VubeIE
 from .wat import WatIE
 from .weibo import WeiboIE
 from .wimp import WimpIE
index d891fa3012c7bf7e2fe59838002a014b7c0fb8bb..58846e8e7cfacc631015ce32ace23cbd9b8a2576 100644 (file)
@@ -1,12 +1,9 @@
 from __future__ import unicode_literals
 
 import re
-import string
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
+
 
 translation_table = {
     'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n',
index d10b7bd0cda2161838e83da741fe7a2f6c88f575..2b4bf34c920807c46eabbd919468ad2c47bc79ed 100644 (file)
@@ -28,7 +28,25 @@ class CollegeHumorIE(InfoExtractor):
             'description': 'This video wasn\'t long enough, so we made it double-spaced.',
             'age_limit': 10,
         },
-    }]
+    },
+    # embedded youtube video
+    {
+        'url': 'http://www.collegehumor.com/embed/6950457',
+        'info_dict': {
+            'id': 'W5gMp3ZjYg4',
+            'ext': 'mp4',
+            'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
+            'uploader': 'Funnyplox TV',
+            'uploader_id': 'funnyploxtv',
+            'description': 'md5:506f69f7a297ed698ced3375f2363b0e',
+            'upload_date': '20140128',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Youtube'],
+    },
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -38,6 +56,12 @@ class CollegeHumorIE(InfoExtractor):
         data = json.loads(self._download_webpage(
             jsonUrl, video_id, 'Downloading info JSON'))
         vdata = data['video']
+        if vdata.get('youtubeId') is not None:
+            return {
+                '_type': 'url',
+                'url': vdata['youtubeId'],
+                'ie_key': 'Youtube',
+            }
 
         AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0}
         rating = vdata.get('rating')
@@ -49,7 +73,7 @@ class CollegeHumorIE(InfoExtractor):
         PREFS = {'high_quality': 2, 'low_quality': 0}
         formats = []
         for format_key in ('mp4', 'webm'):
-            for qname, qurl in vdata[format_key].items():
+            for qname, qurl in vdata.get(format_key, {}).items():
                 formats.append({
                     'format_id': format_key + '_' + qname,
                     'url': qurl,
index 70ba9eaba3f1e4cc32207867ba3dc6a4ed0d2b10..2c0c75604b96cfd08a283c0d93ef36f85b525922 100644 (file)
@@ -399,7 +399,7 @@ class InfoExtractor(object):
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
         template = r'<meta[^>]+?%s[^>]+?%s'
         return [
index e54009622aa353d618326ca51044aa567c2eef55..d65046f588d0bf4481ec6f8d8de7e031e6bdb2f9 100644 (file)
@@ -1,49 +1,60 @@
 from __future__ import unicode_literals
 
-import json
 import re
 
 from .common import InfoExtractor
 from ..utils import (
     unescapeHTML,
+    find_xpath_attr,
 )
 
 
 class CSpanIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?c-spanvideo\.org/program/(?P<name>.*)'
+    _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>\d+)'
     IE_DESC = 'C-SPAN'
     _TEST = {
-        'url': 'http://www.c-spanvideo.org/program/HolderonV',
-        'file': '315139.mp4',
+        'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
         'md5': '8e44ce11f0f725527daccc453f553eb0',
         'info_dict': {
+            'id': '315139',
+            'ext': 'mp4',
             'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
-            'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in [Shelby County v. Holder] in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
+            'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
         },
         'skip': 'Regularly fails on travis, for unknown reasons',
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        prog_name = mobj.group('name')
-        webpage = self._download_webpage(url, prog_name)
-        video_id = self._search_regex(r'prog(?:ram)?id=(.*?)&', webpage, 'video id')
-
-        title = self._html_search_regex(
-            r'<!-- title -->\n\s*<h1[^>]*>(.*?)</h1>', webpage, 'title')
-        description = self._og_search_description(webpage)
+        page_id = mobj.group('id')
+        webpage = self._download_webpage(url, page_id)
+        video_id = self._search_regex(r'data-progid=\'(\d+)\'>', webpage, 'video id')
+
+        description = self._html_search_regex(
+            [
+                # The full description
+                r'<div class=\'expandable\'>(.*?)<a href=\'#\'',
+                # If the description is small enough the other div is not
+                # present, otherwise this is a stripped version
+                r'<p class=\'initial\'>(.*?)</p>'
+            ],
+            webpage, 'description', flags=re.DOTALL)
 
         info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
-        data_json = self._download_webpage(
-            info_url, video_id, 'Downloading video info')
-        data = json.loads(data_json)
+        data = self._download_json(info_url, video_id)
 
         url = unescapeHTML(data['video']['files'][0]['path']['#text'])
 
+        doc = self._download_xml('http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
+            video_id)
+
+        def find_string(s):
+            return find_xpath_attr(doc, './/string', 'name', s).text
+
         return {
             'id': video_id,
-            'title': title,
+            'title': find_string('title'),
             'url': url,
             'description': description,
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': find_string('poster'),
         }
index 7c208b85d21d58fb47c1be5d2db16598c10d1757..ed32373a1be5e3a20a0f7a1ca04c2da091ee68b6 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 
 class InfoQIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
+    _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$'
     _TEST = {
         "name": "InfoQ",
         "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things",
@@ -26,9 +26,9 @@ class InfoQIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
 
-        webpage = self._download_webpage(url, video_id=url)
-        self.report_extraction(url)
+        webpage = self._download_webpage(url, video_id)
 
         # Extract video URL
         encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id')
@@ -50,6 +50,6 @@ class InfoQIE(InfoExtractor):
             'id': video_id,
             'url': video_url,
             'title': video_title,
-            'ext': extension, # Extension is always(?) mp4, but seems to be flv
+            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
             'description': video_description,
         }
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
new file mode 100644 (file)
index 0000000..4e4035b
--- /dev/null
@@ -0,0 +1,78 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class LifeNewsIE(InfoExtractor):
+    """Extractor for news videos hosted on lifenews.ru."""
+
+    IE_NAME = 'lifenews'
+    IE_DESC = 'LIFE | NEWS'
+    _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://lifenews.ru/news/126342',
+        'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
+        'info_dict': {
+            'id': '126342',
+            'ext': 'mp4',
+            'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',
+            'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.',
+            'thumbnail': 'http://lifenews.ru/static/posts/2014/1/126342/.video.jpg',
+            'upload_date': '20140130',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the news item addressed by ``url``.
+
+        The mobile version of the page is always the one downloaded.
+        Thumbnail, counters and upload date are optional: when one of them
+        cannot be found it is reported as ``None`` instead of aborting the
+        extraction.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page')
+
+        video_url = self._html_search_regex(
+            r'<video.*?src="([^"]+)"></video>', webpage, 'video URL')
+
+        thumbnail = self._html_search_regex(
+            r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail',
+            fatal=False)
+
+        title = self._og_search_title(webpage)
+        TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
+        if title.endswith(TITLE_SUFFIX):
+            title = title[:-len(TITLE_SUFFIX)]
+
+        description = self._og_search_description(webpage)
+
+        view_count = self._html_search_regex(
+            r'<div class=\'views\'>(\d+)</div>', webpage, 'view count',
+            fatal=False)
+        comment_count = self._html_search_regex(
+            r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count',
+            fatal=False)
+
+        upload_date = self._html_search_regex(
+            r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',
+            fatal=False)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'title': title,
+            'description': description,
+            # Counts are scraped as digit strings; expose them as integers.
+            'view_count': None if view_count is None else int(view_count),
+            'comment_count': None if comment_count is None else int(comment_count),
+            'upload_date': None if upload_date is None else unified_strdate(upload_date),
+        }
\ No newline at end of file
index 4e76c1f4a5f8c0a802ea95eaac6d20eb373998c3..0a700d663215df27f3553100862e9805e2c9f6ea 100644 (file)
@@ -4,9 +4,6 @@ import json
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
 
 
 class LiveLeakIE(InfoExtractor):
index f6f31bfdc53c6cb0685cfb74e9cc59b56269d77f..4521451ac933baabce786c76813e3e8407298b65 100644 (file)
@@ -119,7 +119,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
             if mgid.endswith('.swf'):
                 mgid = mgid[:-4]
         except RegexNotFoundError:
-            mgid = self._search_regex(r'data-mgid="(.*?)"', webpage, u'mgid')
+            mgid = self._search_regex(
+                [r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],
+                webpage, u'mgid')
         return self._get_videos_info(mgid)
 
 
index 5cb83ba143ba27b072ef9ac80050654c314bc037..2e72e8915aab601b6916fd18e4f64090a613986e 100644 (file)
@@ -4,18 +4,18 @@ import json
 import re
 
 from .common import InfoExtractor
-from ..utils import determine_ext
 
 
 class NewgroundsIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)?(?:www\.)?newgrounds\.com/audio/listen/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/audio/listen/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://www.newgrounds.com/audio/listen/549479',
-        'file': '549479.mp3',
         'md5': 'fe6033d297591288fa1c1f780386f07a',
         'info_dict': {
-            "title": "B7 - BusMode",
-            "uploader": "Burn7",
+            'id': '549479',
+            'ext': 'mp3',
+            'title': 'B7 - BusMode',
+            'uploader': 'Burn7',
         }
     }
 
diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py
new file mode 100644 (file)
index 0000000..81b7855
--- /dev/null
@@ -0,0 +1,73 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+)
+
+
+class NormalbootsIE(InfoExtractor):
+    """Extractor for videos published on normalboots.com."""
+
+    _VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
+    _TEST = {
+        'url': 'http://normalboots.com/video/home-alone-games-jontron/',
+        'md5': '8bf6de238915dd501105b44ef5f1e0f6',
+        'info_dict': {
+            'id': 'home-alone-games-jontron',
+            'ext': 'mp4',
+            'title': 'Home Alone Games - JonTron - NormalBoots',
+            'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/',
+            'uploader': 'JonTron',
+            'upload_date': '20140125',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the normalboots.com video at ``url``.
+
+        The video page supplies the metadata; the media file name is read
+        from the embedded player page that the video page points to.
+
+        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        video_id = mobj.group('videoid')
+
+        # _VALID_URL makes the scheme optional, so add it before downloading.
+        if url[:4] != 'http':
+            url = 'http://' + url
+
+        webpage = self._download_webpage(url, video_id)
+        video_title = self._og_search_title(webpage)
+        video_description = self._og_search_description(webpage)
+        video_thumbnail = self._og_search_thumbnail(webpage)
+        video_uploader = self._html_search_regex(
+            r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>',
+            webpage, 'uploader')
+        raw_upload_date = self._html_search_regex(
+            r'<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
+            webpage, 'date')
+
+        player_url = self._html_search_regex(
+            r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"',
+            webpage, 'url')
+        player_page = self._download_webpage(player_url, video_id)
+        video_file = self._html_search_regex(
+            r"'file':\s'(?P<file>[0-9A-Za-z-_\.]+)'", player_page, 'file')
+
+        return {
+            'id': video_id,
+            'url': 'http://player.screenwavemedia.com/' + video_file,
+            'title': video_title,
+            'description': video_description,
+            'thumbnail': video_thumbnail,
+            'uploader': video_uploader,
+            'upload_date': unified_strdate(raw_upload_date),
+        }
index b9cb7abd14cef4714a063d1d21d18d14eee60be2..2c53ed2e1147a50248a4294c838ceef67688a356 100644 (file)
@@ -6,8 +6,6 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_urllib_parse_urlparse,
-
     ExtractorError,
 )
 
@@ -16,9 +14,10 @@ class RBMARadioIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
     _TEST = {
         'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011',
-        'file': 'ford-lopatin-live-at-primavera-sound-2011.mp3',
         'md5': '6bc6f9bcb18994b4c983bc3bf4384d95',
         'info_dict': {
+            'id': 'ford-lopatin-live-at-primavera-sound-2011',
+            'ext': 'mp3',
             "uploader_id": "ford-lopatin",
             "location": "Spain",
             "description": "Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.",
@@ -42,7 +41,6 @@ class RBMARadioIE(InfoExtractor):
             raise ExtractorError('Invalid JSON: ' + str(e))
 
         video_url = data['akamai_url'] + '&cbr=256'
-        url_parts = compat_urllib_parse_urlparse(video_url)
 
         return {
             'id': video_id,
index c32f64d99791dbc4d866ad0a2bc681fa5ae3609d..4678f62dfadba9968ff363a919471c869eb35c71 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -11,12 +13,12 @@ class Ro220IE(InfoExtractor):
     IE_NAME = '220.ro'
     _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)'
     _TEST = {
-        u"url": u"http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/",
-        u'file': u'LYV6doKo7f.mp4',
-        u'md5': u'03af18b73a07b4088753930db7a34add',
-        u'info_dict': {
-            u"title": u"Luati-le Banii sez 4 ep 1",
-            u"description": u"Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.",
+        "url": "http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/",
+        'file': 'LYV6doKo7f.mp4',
+        'md5': '03af18b73a07b4088753930db7a34add',
+        'info_dict': {
+            "title": "Luati-le Banii sez 4 ep 1",
+            "description": "Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.",
         }
     }
 
@@ -27,10 +29,10 @@ class Ro220IE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
         flashVars_str = self._search_regex(
             r'<param name="flashVars" value="([^"]+)"',
-            webpage, u'flashVars')
+            webpage, 'flashVars')
         flashVars = compat_parse_qs(flashVars_str)
 
-        info = {
+        return {
             '_type': 'video',
             'id': video_id,
             'ext': 'mp4',
@@ -39,4 +41,3 @@ class Ro220IE(InfoExtractor):
             'description': clean_html(flashVars['desc'][0]),
             'thumbnail': flashVars['preview'][0],
         }
-        return info
index 9f8d3a5fa830b09b322915c3e6f1ac72c4e936c8..aea8e64393afc9c9e41a1e2d2a09f9d379151992 100644 (file)
@@ -1,34 +1,36 @@
-import re
+from __future__ import unicode_literals
 
 from .mtv import MTVServicesInfoExtractor
 
 
 class SouthParkStudiosIE(MTVServicesInfoExtractor):
-    IE_NAME = u'southparkstudios.com'
-    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+    IE_NAME = 'southparkstudios.com'
+    _VALID_URL = r'https?://(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
 
     _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
 
     _TESTS = [{
-        u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
-        u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
-        u'info_dict': {
-            u'title': u'Bat Daded',
-            u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',
+        'url': 'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
+        'info_dict': {
+            'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30',
+            'ext': 'mp4',
+            'title': 'Bat Daded',
+            'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.',
         },
     }]
 
 
 class SouthparkDeIE(SouthParkStudiosIE):
-    IE_NAME = u'southpark.de'
-    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+    IE_NAME = 'southpark.de'
+    _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
     _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
 
     _TESTS = [{
-        u'url': u'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
-        u'file': u'85487c96-b3b9-4e39-9127-ad88583d9bf2.mp4',
-        u'info_dict': {
-            u'title': u'The Government Won\'t Respect My Privacy',
-            u'description': u'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
+        'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
+        'info_dict': {
+            'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2',
+            'ext': 'mp4',
+            'title': 'The Government Won\'t Respect My Privacy',
+            'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
         },
     }]
index 051a34d5b8b048db9112ac2df5f3f04115447a87..9156d7fafd6ac2688ab329b31f1b683ab868def0 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -6,20 +8,20 @@ from .common import InfoExtractor
 class SpiegelIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
     _TESTS = [{
-        u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
-        u'file': u'1259285.mp4',
-        u'md5': u'2c2754212136f35fb4b19767d242f66e',
-        u'info_dict': {
-            u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv"
-        }
+        'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
+        'file': '1259285.mp4',
+        'md5': '2c2754212136f35fb4b19767d242f66e',
+        'info_dict': {
+            'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
+        },
     },
     {
-        u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
-        u'file': u'1309159.mp4',
-        u'md5': u'f2cdf638d7aa47654e251e1aee360af1',
-        u'info_dict': {
-            u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers'
-        }
+        'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
+        'file': '1309159.mp4',
+        'md5': 'f2cdf638d7aa47654e251e1aee360af1',
+        'info_dict': {
+            'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
+        },
     }]
 
     def _real_extract(self, url):
@@ -29,17 +31,17 @@ class SpiegelIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         video_title = self._html_search_regex(
-            r'<div class="module-title">(.*?)</div>', webpage, u'title')
+            r'<div class="module-title">(.*?)</div>', webpage, 'title')
 
-        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
+        xml_url = 'http://video2.spiegel.de/flash/' + video_id + '.xml'
         idoc = self._download_xml(
             xml_url, video_id,
-            note=u'Downloading XML', errnote=u'Failed to download XML')
+            note='Downloading XML', errnote='Failed to download XML')
 
         formats = [
             {
                 'format_id': n.tag.rpartition('type')[2],
-                'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text,
+                'url': 'http://video2.spiegel.de/flash/' + n.find('./filename').text,
                 'width': int(n.find('./width').text),
                 'height': int(n.find('./height').text),
                 'abr': int(n.find('./audiobitrate').text),
@@ -55,10 +57,9 @@ class SpiegelIE(InfoExtractor):
 
         self._sort_formats(formats)
 
-        info = {
+        return {
             'id': video_id,
             'title': video_title,
             'duration': duration,
             'formats': formats,
         }
-        return info
diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py
new file mode 100644 (file)
index 0000000..2246d27
--- /dev/null
@@ -0,0 +1,62 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class TinyPicIE(InfoExtractor):
+    """Extractor for videos hosted on tinypic.com."""
+
+    IE_NAME = 'tinypic'
+    IE_DESC = 'tinypic.com videos'
+    _VALID_URL = r'http://tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
+
+    _TEST = {
+        'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
+        'md5': '609b74432465364e72727ebc6203f044',
+        'info_dict': {
+            'id': '6xw7tc',
+            'ext': 'flv',
+            'title': 'shadow phenomenon weird',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the tinypic.com player page at ``url``.
+
+        Raises ExtractorError (marked as expected) when the page does not
+        contain the player's "file" and "s" variables.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id, 'Downloading page')
+
+        # Both halves must be raw strings so the regex escapes reach ``re``
+        # untouched instead of being parsed as string escapes.
+        mobj = re.search(
+            r'(?m)fo\.addVariable\("file",\s"(?P<fileid>[\da-z]+)"\);\n'
+            r'\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage)
+        if mobj is None:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        file_id = mobj.group('fileid')
+        server_id = mobj.group('serverid')
+
+        # The title is whatever precedes the fixed keyword list in the
+        # keywords meta tag; pages without that suffix get an empty title.
+        KEYWORDS_SUFFIX = ', Video, images, photos, videos, myspace, ebay, video hosting, photo hosting'
+        keywords = self._html_search_meta('keywords', webpage, 'title')
+        title = keywords[:-len(KEYWORDS_SUFFIX)] if keywords.endswith(KEYWORDS_SUFFIX) else ''
+
+        video_url = 'http://v%s.tinypic.com/%s.flv' % (server_id, file_id)
+        thumbnail = 'http://v%s.tinypic.com/%s_th.jpg' % (server_id, file_id)
+
+        return {
+            'id': file_id,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'title': title,
+        }
\ No newline at end of file
index 35f89e9eecb0145988f8f503dcb6b18aea5d0e93..3b1a6fb61dbe84c3ce648de430674fb1c589740c 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -6,12 +8,13 @@ from .common import InfoExtractor
 class TrailerAddictIE(InfoExtractor):
     _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)'
     _TEST = {
-        u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer',
-        u'file': u'76184.mp4',
-        u'md5': u'57e39dbcf4142ceb8e1f242ff423fd71',
-        u'info_dict': {
-            u"title": u"Prince Avalanche Trailer",
-            u"description": u"Trailer for Prince Avalanche.Two highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind."
+        'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer',
+        'md5': '41365557f3c8c397d091da510e73ceb4',
+        'info_dict': {
+            'id': '76184',
+            'ext': 'mp4',
+            'title': 'Prince Avalanche Trailer',
+            'description': 'Trailer for Prince Avalanche.\n\nTwo highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind.',
         }
     }
 
@@ -22,9 +25,15 @@ class TrailerAddictIE(InfoExtractor):
 
         title = self._search_regex(r'<title>(.+?)</title>',
                 webpage, 'video title').replace(' - Trailer Addict','')
-        view_count = self._search_regex(r'Views: (.+?)<br />',
-                webpage, 'Views Count')
-        video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1]
+        view_count_str = self._search_regex(
+            r'<span class="views_n">([0-9,.]+)</span>',
+            webpage, 'view count', fatal=False)
+        view_count = (
+            None if view_count_str is None
+            else int(view_count_str.replace(',', '')))
+        video_id = self._search_regex(
+            r'<param\s+name="movie"\s+value="/emb/([0-9]+)"\s*/>',
+            webpage, 'video id')
 
         # Presence of (no)watchplus function indicates HD quality is available
         if re.search(r'function (no)?watchplus()', webpage):
@@ -39,14 +48,16 @@ class TrailerAddictIE(InfoExtractor):
                 info_webpage, 'Download url').replace('%3F','?')
         thumbnail_url = self._search_regex(r'&image=(.+?)&',
                 info_webpage, 'thumbnail url')
-        ext = final_url.split('.')[-1].split('?')[0]
-
-        return [{
-            'id'          : video_id,
-            'url'         : final_url,
-            'ext'         : ext,
-            'title'       : title,
-            'thumbnail'   : thumbnail_url,
-            'description' : self._og_search_description(webpage),
-            'view_count'  : view_count,
-        }]
+
+        description = self._html_search_regex(
+            r'(?s)<div class="synopsis">.*?<div class="movie_label_info"[^>]*>(.*?)</div>',
+            webpage, 'description', fatal=False)
+
+        return {
+            'id': video_id,
+            'url': final_url,
+            'title': title,
+            'thumbnail': thumbnail_url,
+            'description': description,
+            'view_count': view_count,
+        }
index 74c82587f802acc82e3025235a4bfe92725e8111..7fa2b9e159ed1a60c056140f05f51851663830e9 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import json
 import re
 
@@ -10,48 +12,48 @@ from ..utils import (
 
 class UstreamIE(InfoExtractor):
     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
-    IE_NAME = u'ustream'
+    IE_NAME = 'ustream'
     _TEST = {
-        u'url': u'http://www.ustream.tv/recorded/20274954',
-        u'file': u'20274954.flv',
-        u'md5': u'088f151799e8f572f84eb62f17d73e5c',
-        u'info_dict': {
-            u"uploader": u"Young Americans for Liberty", 
-            u"title": u"Young Americans for Liberty February 7, 2012 2:28 AM"
-        }
+        'url': 'http://www.ustream.tv/recorded/20274954',
+        'file': '20274954.flv',
+        'md5': '088f151799e8f572f84eb62f17d73e5c',
+        'info_dict': {
+            "uploader": "Young Americans for Liberty",
+            "title": "Young Americans for Liberty February 7, 2012 2:28 AM",
+        },
     }
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
         video_id = m.group('videoID')
 
-        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
+        video_url = 'http://tcdn.ustream.tv/video/%s' % video_id
         webpage = self._download_webpage(url, video_id)
 
         self.report_extraction(video_id)
 
         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
-            webpage, u'title')
+            webpage, 'title')
 
         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
-            webpage, u'uploader', fatal=False, flags=re.DOTALL)
+            webpage, 'uploader', fatal=False, flags=re.DOTALL)
 
         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
-            webpage, u'thumbnail', fatal=False)
-
-        info = {
-                'id': video_id,
-                'url': video_url,
-                'ext': 'flv',
-                'title': video_title,
-                'uploader': uploader,
-                'thumbnail': thumbnail,
-               }
-        return info
+            webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'flv',
+            'title': video_title,
+            'uploader': uploader,
+            'thumbnail': thumbnail,
+        }
+
 
 class UstreamChannelIE(InfoExtractor):
     _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)'
-    IE_NAME = u'ustream:channel'
+    IE_NAME = 'ustream:channel'
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
index a4b26a26f4132840c57700fad96785dfb390a8db..e458ac961f701391c2fb651b1557ac7070dc6e2a 100644 (file)
@@ -1,5 +1,6 @@
+from __future__ import unicode_literals
+
 import re
-import json
 import xml.etree.ElementTree
 import datetime
 
@@ -22,16 +23,16 @@ class VevoIE(InfoExtractor):
            vevo:)
         (?P<id>[^&?#]+)'''
     _TESTS = [{
-        u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
-        u'file': u'GB1101300280.mp4',
-        u"md5": u"06bea460acb744eab74a9d7dcb4bfd61",
-        u'info_dict': {
-            u"upload_date": u"20130624",
-            u"uploader": u"Hurts",
-            u"title": u"Somebody to Die For",
-            u"duration": 230.12,
-            u"width": 1920,
-            u"height": 1080,
+        'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
+        'file': 'GB1101300280.mp4',
+        "md5": "06bea460acb744eab74a9d7dcb4bfd61",
+        'info_dict': {
+            "upload_date": "20130624",
+            "uploader": "Hurts",
+            "title": "Somebody to Die For",
+            "duration": 230.12,
+            "width": 1920,
+            "height": 1080,
         }
     }]
     _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
@@ -44,7 +45,7 @@ class VevoIE(InfoExtractor):
                 if version['version'] > last_version['version']:
                     last_version = version
         if last_version['version'] == -1:
-            raise ExtractorError(u'Unable to extract last version of the video')
+            raise ExtractorError('Unable to extract last version of the video')
 
         renditions = xml.etree.ElementTree.fromstring(last_version['data'])
         formats = []
@@ -85,7 +86,7 @@ class VevoIE(InfoExtractor):
             format_url = self._SMIL_BASE_URL + m.group('path')
             formats.append({
                 'url': format_url,
-                'format_id': u'SMIL_' + m.group('cbr'),
+                'format_id': 'SMIL_' + m.group('cbr'),
                 'vcodec': m.group('vcodec'),
                 'acodec': m.group('acodec'),
                 'vbr': int(m.group('vbr')),
@@ -101,26 +102,25 @@ class VevoIE(InfoExtractor):
         video_id = mobj.group('id')
 
         json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
-        info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
-        video_info = json.loads(info_json)['video']
+        video_info = self._download_json(json_url, video_id)['video']
 
         formats = self._formats_from_json(video_info)
         try:
             smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
                 self._SMIL_BASE_URL, video_id, video_id.lower())
             smil_xml = self._download_webpage(smil_url, video_id,
-                                              u'Downloading SMIL info')
+                                              'Downloading SMIL info')
             formats.extend(self._formats_from_smil(smil_xml))
         except ExtractorError as ee:
             if not isinstance(ee.cause, compat_HTTPError):
                 raise
             self._downloader.report_warning(
-                u'Cannot download SMIL information, falling back to JSON ..')
+                'Cannot download SMIL information, falling back to JSON ..')
 
         timestamp_ms = int(self._search_regex(
-            r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))
+            r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))
         upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
-        info = {
+        return {
             'id': video_id,
             'title': video_info['title'],
             'formats': formats,
@@ -129,5 +129,3 @@ class VevoIE(InfoExtractor):
             'uploader': video_info['mainArtists'][0]['artistName'],
             'duration': video_info['duration'],
         }
-
-        return info
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
new file mode 100644 (file)
index 0000000..fbdff47
--- /dev/null
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+import datetime
+
+from .common import InfoExtractor
+
+
+class VubeIE(InfoExtractor):
+    """Information extractor for videos hosted on vube.com."""
+
+    IE_NAME = 'vube'
+    IE_DESC = 'Vube.com'
+    _VALID_URL = r'http://vube\.com/[^/]+/(?P<id>[\da-zA-Z]{10})'
+
+    _TEST = {
+        'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
+        'md5': 'f81dcf6d0448e3291f54380181695821',
+        'info_dict': {
+            'id': 'YL2qNPkqon',
+            'ext': 'mp4',
+            'title': 'Chiara Grispo - Price Tag by Jessie J',
+            'description': 'md5:8ea652a1f36818352428cb5134933313',
+            'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f.jpg',
+            'uploader': 'Chiara.Grispo',
+            'uploader_id': '1u3hX0znhP',
+            'upload_date': '20140103',
+            'duration': 170.56,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the video at ``url``.
+
+        Queries the vube.com JSON API for the video metadata and a
+        second endpoint for the comment total.
+        """
+        video_id = re.match(self._VALID_URL, url).group('id')
+
+        video = self._download_json(
+            'http://vube.com/api/v2/video/%s' % video_id,
+            video_id, 'Downloading video JSON')
+        public_id = video['public_id']
+        # Only renditions with transcoding_status 'processed' are offered.
+        formats = [{
+            'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (
+                fmt['media_resolution_id'], public_id),
+            'height': int(fmt['height']),
+            'abr': int(fmt['audio_bitrate']),
+            'vbr': int(fmt['video_bitrate']),
+            'format_id': fmt['media_resolution_id'],
+        } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed']
+        self._sort_formats(formats)
+
+        thumbnail = video['thumbnail_src']
+        if thumbnail.startswith('//'):
+            # Protocol-relative URL: prepend a scheme to make it absolute.
+            thumbnail = 'http:' + thumbnail
+        # Convert in UTC so the date does not depend on the local time zone.
+        upload_date = datetime.datetime.utcfromtimestamp(
+            int(video['upload_time'])).strftime('%Y%m%d')
+
+        comment = self._download_json(
+            'http://vube.com/api/video/%s/comment' % video_id,
+            video_id, 'Downloading video comment JSON')
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': video['title'],
+            'description': video.get('description'),
+            'thumbnail': thumbnail,
+            'uploader': video['user_alias'],
+            'uploader_id': video['user_url_id'],
+            'upload_date': upload_date,
+            'duration': video['duration'],
+            'view_count': video['raw_view_count'],
+            'like_count': video['total_likes'],
+            'dislike_count': video['total_hates'],
+            'comment_count': comment['total'],
+        }
\ No newline at end of file
index d317f29f2fed1b62724b8589088b092e093e0c24..f6c515f7f8a5ad9b038b7f417800c3c833fb8ac3 100644 (file)
@@ -5,7 +5,6 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
-    unescapeHTML,
     ExtractorError,
 )
 
index 8b8a3977a3ec2bc96f4a454c96a5ebff6873823f..b5748c14e0bc50b693165177a0e1925e077b7fc2 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2014.01.30.1'
+__version__ = '2014.01.30.2'