# Python source code of the bandcamp plugin for beets.

# Copyright (C) 2015 Ariel George
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; version 2.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

"""Adds bandcamp album search support to the autotagger. Requires the
BeautifulSoup library.
"""

from __future__ import (division, absolute_import, print_function,
                        unicode_literals)

import beets.ui
from beets.autotag.hooks import AlbumInfo, TrackInfo, Distance
from beets import plugins
from beetsplug import fetchart
import beets
import requests
from bs4 import BeautifulSoup
import isodate
import six


# Value sent in the ``User-Agent`` header of every request made by this module.
USER_AGENT = 'beets/{0} +http://beets.radbox.org/'.format(beets.__version__)
# Search URL template; ``query`` and ``page`` are filled in by ``_search``.
BANDCAMP_SEARCH = 'http://bandcamp.com/search?q={query}&page={page}'
# Search result types. ``_search`` uses them to build the CSS class
# (``searchresult <type>``) of the result entries it collects.
BANDCAMP_ALBUM = 'album'
BANDCAMP_ARTIST = 'band'
BANDCAMP_TRACK = 'track'
# Separator used by ``_split_artist_title`` to split "artist - title" names.
ARTIST_TITLE_DELIMITER = ' - '


class BandcampPlugin(plugins.BeetsPlugin):

    def __init__(self):
        """Sets up the plugin's default options and hooks.

        Registers the default configuration values, subscribes ``loaded`` to
        the ``pluginload`` event and installs ``imported`` as the plugin's
        only import stage.
        """
        super(BandcampPlugin, self).__init__()

        defaults = {
            'source_weight': 0.5,
            'min_candidates': 5,
            'lyrics': False,
            'art': False,
            'split_artist_title': False,
        }
        self.config.add(defaults)

        self.register_listener('pluginload', self.loaded)
        self.import_stages = [self.imported]

    def loaded(self):
        """Hooks the bandcamp art source into the fetchart plugin.

        Does nothing unless the ``art`` option is enabled. Otherwise the first
        loaded ``FetchArtPlugin`` instance gets a ``BandcampAlbumArt`` source
        prepended to its sources, and the source is registered in fetchart's
        ``ART_SOURCES`` and ``SOURCE_NAMES`` tables.
        """
        if not self.config['art']:
            return

        # FIXME: Reaching into fetchart like this is ugly, but there seems to
        # be no other way to extend it without declaring a new plugin.
        for candidate in plugins.find_plugins():
            if not isinstance(candidate, fetchart.FetchArtPlugin):
                continue

            bandcamp_source = BandcampAlbumArt(candidate._log, self.config)
            candidate.sources = [bandcamp_source] + candidate.sources
            fetchart.ART_SOURCES['bandcamp'] = BandcampAlbumArt
            fetchart.SOURCE_NAMES[BandcampAlbumArt] = 'bandcamp'
            # Only the first fetchart instance is patched.
            break

    def album_distance(self, items, album_info, mapping):
        """Returns the album distance.

        The result is a fresh ``Distance``. When ``album_info`` comes from
        bandcamp, the configured ``source_weight`` is added to it under the
        ``source`` key; ``items`` and ``mapping`` are not used.
        """
        penalty = Distance()
        from_bandcamp = (hasattr(album_info, 'data_source')
                         and album_info.data_source == 'bandcamp')
        if from_bandcamp:
            weight = self.config['source_weight'].as_number()
            penalty.add('source', weight)
        return penalty

    def candidates(self, items, artist, album, va_likely, extra_tags=None):
        """Returns a list of AlbumInfo objects for bandcamp search results
        matching an album.

        Only the ``album`` title is used as the search query; the remaining
        arguments are accepted but not used.
        """
        matches = self.get_albums(album)
        return matches

    def album_for_id(self, album_id):
        """Fetches an album by its bandcamp ID and returns an AlbumInfo object
        or None if the album is not found.
        """
        # The album url doubles as its id, so looking an album up by id is
        # just a matter of fetching and parsing that page.
        return self.get_album_info(album_id)

    def item_candidates(self, item, artist, album):
        """Returns a list of TrackInfo objects from a bandcamp search matching
        a singleton.

        The search query is the first non-empty value among the item's title,
        album and artist. When all three are empty no search is made and an
        empty list is returned.
        """
        query = item.title or item.album or item.artist
        if not query:
            return []
        return self.get_tracks(query)

    def track_for_id(self, track_id):
        """Fetches a track by its bandcamp ID and returns a TrackInfo object
        or None if the track is not found.
        """
        # The track id is used directly as the url of the track page.
        return self.get_track_info(track_id)

    def imported(self, session, task):
        """Import hook for fetching lyrics from bandcamp automatically.

        Does nothing unless the ``lyrics`` option is enabled. Lyrics are
        fetched, and written to the file, only for imported items whose
        ``data_source`` is ``bandcamp``.
        """
        if not self.config['lyrics']:
            return

        for item in task.imported_items():
            from_bandcamp = (hasattr(item, 'data_source')
                             and item.data_source == 'bandcamp')
            if from_bandcamp:
                self.add_lyrics(item, True)

    def get_albums(self, query):
        """Returns a list of AlbumInfo objects for a bandcamp search query.

        Every album url found by the search is fetched and parsed; pages that
        could not be turned into an AlbumInfo are left out of the result.
        """
        fetched = (self.get_album_info(url)
                   for url in self._search(query, BANDCAMP_ALBUM))
        return [info for info in fetched if info is not None]

    def get_album_info(self, url):
        """Returns an AlbumInfo object for a bandcamp album page.

        The page at ``url`` is fetched and scraped for the album name, artist,
        release date and track list. ``url`` is used both as the album id and
        as the data url of the result.

        Returns None, after logging the reason at debug level, when the page
        cannot be fetched, when its markup does not have the expected shape,
        or when one of its tracks has no url.
        """
        try:
            html = self._get(url)
            name_section = html.find(id='name-section')
            album = name_section.find(attrs={'itemprop': 'name'}).text.strip()
            # Even though there is an item_id in some urls in bandcamp, it's
            # not visible on the page and you can't search by the id, so we
            # need to use the url as id.
            album_id = url
            artist = name_section.find(attrs={'itemprop': 'byArtist'}).text.strip()
            release = html.find('meta', attrs={'itemprop': 'datePublished'})['content']
            release = isodate.parse_date(release)
            artist_url = url.split('/album/')[0]

            tracks = []
            track_rows = html.find(id='track_table').find_all(attrs={'itemprop': 'tracks'})
            for row in track_rows:
                track = self._parse_album_track(row)
                # The id parsed from the page is appended to the artist url so
                # that the stored track id is a full url.
                track.track_id = '{0}{1}'.format(artist_url, track.track_id)
                tracks.append(track)

            return AlbumInfo(album, album_id, artist, artist_url, tracks,
                             year=release.year, month=release.month,
                             day=release.day, country='XW', media='Digital Media',
                             data_source='bandcamp', data_url=url)
        except requests.exceptions.RequestException as e:
            # Arguments are handed to the logger instead of being formatted
            # here: the beets logger applies str.format itself, so braces in
            # an already formatted message would break the log call.
            self._log.debug("Communication error while fetching album {0!r}: "
                            "{1}", url, e)
        except (TypeError, AttributeError, KeyError, IndexError, ValueError) as e:
            # Missing elements or attributes, or values that cannot be parsed
            # (e.g. the release date), all mean the page is not an album page
            # in the expected format.
            self._log.debug("Unexpected html while scraping album {0!r}: {1}", url, e)
        except BandcampException as e:
            self._log.debug('Error: {0}', e)
        return None

    def get_tracks(self, query):
        """Returns a list of TrackInfo objects for a bandcamp search query.

        Every track url found by the search is fetched and parsed. Tracks
        whose page could not be turned into a TrackInfo (``get_track_info``
        returned None) are left out, so the result never contains None.
        """
        track_urls = self._search(query, BANDCAMP_TRACK) or []
        fetched = (self.get_track_info(url) for url in track_urls)
        return [track for track in fetched if track is not None]

    def get_track_info(self, url):
        """Returns a TrackInfo object for a bandcamp track page.

        The page at ``url`` is fetched and scraped for the track title, artist
        and duration. ``url`` is used both as the track id and as the data url
        of the result. When the ``split_artist_title`` option is enabled and
        the title contains the artist/title delimiter, the artist is taken
        from the title instead of from the page.

        Returns None, after logging the reason at debug level, when the page
        cannot be fetched or its markup does not have the expected shape.
        """
        try:
            html = self._get(url)
            name_section = html.find(id='name-section')
            title = name_section.find(attrs={'itemprop': 'name'}).text.strip()
            artist_url = url.split('/track/')[0]
            artist = name_section.find(attrs={'itemprop': 'byArtist'}).text.strip()
            if self.config['split_artist_title']:
                artist_from_title, title = self._split_artist_title(title)
                if artist_from_title is not None:
                    artist = artist_from_title

            try:
                duration = html.find('meta', attrs={'itemprop': 'duration'})['content']
                track_length = float(duration)
                if track_length == 0:
                    track_length = None
            except (TypeError, KeyError, ValueError):
                # No duration element, no ``content`` attribute, or a value
                # that is not a number: the length is simply unknown.
                track_length = None

            return TrackInfo(title, url, length=track_length, artist=artist,
                             artist_id=artist_url, data_source='bandcamp',
                             media='Digital Media', data_url=url)
        except requests.exceptions.RequestException as e:
            # Arguments are handed to the logger instead of being formatted
            # here: the beets logger applies str.format itself, so braces in
            # an already formatted message would break the log call.
            self._log.debug("Communication error while fetching track {0!r}: "
                            "{1}", url, e)
        except (TypeError, AttributeError) as e:
            # Same policy as for album pages: a page without the expected
            # elements is reported and skipped rather than raising.
            self._log.debug("Unexpected html while scraping track {0!r}: {1}", url, e)
        return None

    def add_lyrics(self, item, write=False):
        """Fetch and store lyrics for a single item. If ``write``, then the
        lyrics will also be written to the file itself.

        Items that already have lyrics are left untouched, and nothing is
        stored when no lyrics are found.
        """
        if item.lyrics:
            self._log.info('lyrics already present: {0}', item)
            return

        found = self.get_item_lyrics(item)
        if not found:
            self._log.info('lyrics not found: {0}', item)
            return

        self._log.info('fetched lyrics: {0}', item)
        item.lyrics = found

        # Write to the file first (when requested), then store the item.
        if write:
            item.try_write()
        item.store()

    def get_item_lyrics(self, item):
        """Get the lyrics for item from bandcamp.

        Fetches the page at ``item.mb_trackid`` and returns the text of its
        ``lyricsText`` element. Returns None when the page has no such element
        or cannot be fetched; a fetch failure is logged at debug level.
        """
        try:
            # The track id is the bandcamp url when item.data_source is bandcamp.
            html = self._get(item.mb_trackid)
            lyrics = html.find(attrs={'class': 'lyricsText'})
            if lyrics:
                return lyrics.text
        except requests.exceptions.RequestException as e:
            # Arguments are handed to the logger instead of being formatted
            # here: the beets logger applies str.format itself, so braces in
            # an already formatted message would break the log call.
            self._log.debug("Communication error while fetching lyrics for track {0!r}: "
                            "{1}", item.mb_trackid, e)
        return None

    def _search(self, query, search_type=BANDCAMP_ALBUM, page=1):
        """Returns a list of bandcamp urls for items of type search_type
        matching the query.

        Result pages are fetched one after another, starting at ``page``,
        until at least ``min_candidates`` urls have been collected or the
        last result page has been reached. Query strings are stripped from
        the returned urls.

        Always returns a list: it is empty for an invalid ``search_type``, and
        on a communication error it holds whatever was collected from the
        pages fetched before the error. Both cases are logged at debug level.
        """
        if search_type not in (BANDCAMP_ARTIST, BANDCAMP_ALBUM, BANDCAMP_TRACK):
            self._log.debug('Invalid type for search: {0}', search_type)
            return []

        # Percent-encode the query so that characters such as '&', '#' or '+'
        # cannot alter the structure of the search url. Text is encoded as
        # UTF-8 first because Python 2's quote_plus fails on non-ASCII unicode.
        encoded = query
        if not isinstance(encoded, (six.text_type, six.binary_type)):
            encoded = six.text_type(encoded)
        if isinstance(encoded, six.text_type):
            encoded = encoded.encode('utf-8')
        quoted_query = six.moves.urllib.parse.quote_plus(encoded)

        min_candidates = self.config['min_candidates'].as_number()
        clazz = 'searchresult {0}'.format(search_type)
        urls = []
        try:
            # Search bandcamp until min_candidates results have been found or
            # we hit the last page in the results.
            while len(urls) < min_candidates:
                self._log.debug('Searching {0}, page {1}', search_type, page)
                results = self._get(BANDCAMP_SEARCH.format(query=quoted_query, page=page))
                for result in results.find_all('li', attrs={'class': clazz}):
                    heading = result.find(attrs={'class': 'heading'})
                    a = heading.a if heading is not None else None
                    # Skip results without a usable link instead of failing.
                    if a is not None and a.get('href'):
                        urls.append(a['href'].split('?')[0])

                # Stop searching if we are on the last page.
                if not results.find('a', attrs={'class': 'next'}):
                    break
                page += 1
        except requests.exceptions.RequestException as e:
            # Arguments are handed to the logger instead of being formatted
            # here: the beets logger applies str.format itself, so braces in
            # the query or error text would break the log call.
            self._log.debug("Communication error while searching page {0} for {1!r}: "
                            "{2}", page, query, e)
        return urls

    def _get(self, url, timeout=30):
        """Returns a BeautifulSoup object with the contents of url.

        The request is sent with the module's ``USER_AGENT`` and gives up
        after ``timeout`` seconds without a response, so a stalled connection
        cannot block forever.

        Raises ``requests.exceptions.RequestException`` (or a subclass) when
        the request fails, times out, or the server answers with an error
        status.
        """
        headers = {'User-Agent': USER_AGENT}
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        return BeautifulSoup(r.text, 'html.parser')

    def _parse_album_track(self, track_html):
        """Returns a TrackInfo derived from the html describing a track in a
        bandcamp album page.

        The track number is read from the part of the row's ``rel`` attribute
        after the ``=``; title, url and duration come from its ``title-col``
        cell. When the ``split_artist_title`` option is enabled the title is
        split into artist and title. The length is None when the duration is
        missing or cannot be parsed.

        Raises ``BandcampException`` when the track has no url, since the url
        is what the track id is derived from.
        """
        track_num = track_html['rel'].split('=')[1]
        track_num = int(track_num)

        title_html = track_html.find(attrs={'class': 'title-col'})
        title = title_html.find(attrs={'itemprop': 'name'}).text.strip()
        artist = None
        if self.config['split_artist_title']:
            artist, title = self._split_artist_title(title)
        track_url = title_html.find(attrs={'itemprop': 'url'})
        if track_url is None:
            raise BandcampException('No track url (id) for track {0} - {1}'.format(track_num, title))
        track_id = track_url['href']
        try:
            duration = title_html.find('meta', attrs={'itemprop': 'duration'})['content']
            # The page value is expected to lack the "T" time designator that
            # isodate requires; insert it only when it is really missing so
            # that an already valid duration is not corrupted.
            if 'T' not in duration:
                duration = duration.replace('P', 'PT', 1)
            track_length = isodate.parse_duration(duration).total_seconds()
        except (TypeError, KeyError, ValueError):
            # No duration element, no ``content`` attribute, or a value that
            # isodate cannot parse: the length is simply unknown.
            track_length = None

        return TrackInfo(title, track_id, index=track_num, length=track_length, artist=artist)

    def _split_artist_title(self, title):
        """Returns artist and title by splitting title on ARTIST_TITLE_DELIMITER.

        Only the first occurrence of the delimiter separates the artist from
        the title. Without any delimiter the artist is None and the title is
        returned unchanged.
        """
        artist, delimiter, remainder = title.partition(ARTIST_TITLE_DELIMITER)
        if not delimiter:
            return None, title
        return artist, remainder


class BandcampAlbumArt(fetchart.RemoteArtSource):
    """Art source that takes the cover image url from a bandcamp album page."""
    NAME = u"Bandcamp"

    def get(self, album, plugin, paths):
        """Return the url for the cover from the bandcamp album page.
        This only returns cover art urls for bandcamp albums (by id).

        This is a generator. It yields at most one exact-match candidate, and
        nothing at all when the album id is not a string containing
        ``bandcamp``, when the page cannot be fetched, or when the page has no
        cover art link in the expected place. Failures are logged at debug
        level instead of being raised.
        """
        album_url = album.mb_albumid
        if not (isinstance(album_url, six.string_types) and 'bandcamp' in album_url):
            return

        try:
            headers = {'User-Agent': USER_AGENT}
            # A timeout keeps a stalled connection from blocking forever.
            r = requests.get(album_url, headers=headers, timeout=30)
            r.raise_for_status()
            album_html = BeautifulSoup(r.text, 'html.parser').find(id='tralbumArt')
            image_url = album_html.find('a', attrs={'class': 'popupImage'})['href']
            candidate = self._candidate(url=image_url,
                                        match=fetchart.Candidate.MATCH_EXACT)
        except requests.exceptions.RequestException as e:
            # Arguments are handed to the logger instead of being formatted
            # here, so braces in the album or error text cannot break a
            # logger that applies str.format itself.
            self._log.debug("Communication error getting art for {0}: {1}",
                            album, e)
            return
        except (AttributeError, TypeError, KeyError, ValueError) as e:
            # The art container, the image link or its href is missing.
            self._log.debug("Unexpected html while getting art for {0}: {1}",
                            album, e)
            return

        yield candidate


class BandcampException(Exception):
    """Error raised when a bandcamp page lacks data this plugin requires."""