import datetime import os import re import subprocess import lxml.etree as et from lxml import objectify import requests from . import get_corpus_dir from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX, URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path) from .plos_regex import validate_doi from .elements import (parse_article_date, get_contrib_info, Journal, License, match_contribs_to_dicts) from .utils import dedent class Article: """The primary object of a PLOS article, initialized by a valid PLOS DOI. """ def __init__(self, doi, directory=None): """Creation of an article object. Usage: For the first time, you can use `article = Article(doi)` and then it and some attributes will be stored in memory. For creating articles after the first one, you can use: `article.doi = doi` This preserves the generic attributes and erases the article-specific ones (See also reset_memoized_attrs()) Use this to more rapidly iterate through different articles. :param doi: The Digital Object Identifier of the article :type doi: str :param directory: where the local article XML file is located, defaults to None :type directory: str, optional """ self.doi = doi self.directory = directory if directory else get_corpus_dir() self.reset_memoized_attrs() self._editor = None def __eq__(self, other): doi_eq = self.doi == other.doi dir_eq = self.directory == other.directory return doi_eq and dir_eq def __str__(self, exclude_refs=True): """Output when you print an article object on the command line. For parsing and viewing the XML of a local article. Should not be used for hashing Excludes <back> element (including references list) for easier viewing :param exclude_refs: remove references from the article tree (eases print viewing) """ parser = et.XMLParser(remove_blank_text=True) tree = et.parse(self.filepath, parser) if exclude_refs: root = tree.getroot() back = tree.xpath('./back') if back: root.remove(back[0]) local_xml = et.tostring(tree, method='xml', encoding='unicode', pretty_print=True) return local_xml def __repr__(self): """Value of an article object when you call it directly on the command line. Shows the DOI and title of the article :returns: DOI and title :rtype: {str} """ out = "DOI: {0}\nTitle: {1}".format(self.doi, self.title) return out def _repr_html_(self): """Nice display for Jupyter notebook""" titlestyle = 'display:inline-flex;' titletextstyle = 'margin-left:.5em;' titlelink = ('<span style="{titlestyle}"><a href="{url}">' '<em>{title}</em></a></span>').format( url=self.page, title=self.title, titlestyle=titlestyle+titletextstyle, ) doilink = '<span><a href="{url}"><code>{doi}</code></a></span>'.format( url=self.doi_link(), doi=self.doi, ) out = dedent("""<div> <span style="{titlestyle}">Title: {titlelink}</span></br> <span>DOI: <span>{doilink} </div> """).format(doilink=doilink, titlelink=titlelink, titlestyle=titlestyle) return out def reset_memoized_attrs(self): """Reset attributes to None when instantiating a new article object. For article attributes that are memoized and specific to that particular article (including the XML tree and whether the xml file is in the local directory), reset them when creating a new article object. """ self._tree = None self._local = None self._contributors = None @property def doi(self): """The unique Digital Object Identifier for a PLOS article. See https://www.doi.org/ :returns: DOI of the article object :rtype: {str} """ return self._doi @property def text_viewer(self): """Command line application for viewing text to be used with open_in_viewer. Defaults to "open", which opens in whatever the default application is in your operating system for files ending in ".xml". Persists across article objects. Use with self.open_in_viewer() to open an article of interest. Check your text viewers documentation to learn how to launch it from the command line. For Sublime Text, see http://docs.sublimetext.info/en/latest/command_line/command_line.html :returns: command line shortcut for the text viewer :rtype: {str} """ try: return self._text_viewer except AttributeError as e: print(("{}:\n" "You need to assign a non-terminal text viewer " "command able to be run on the CLI to self.text_viewer" ).format(e)) @text_viewer.setter def text_viewer(self, value="open"): """Sets the text viewer for all article objects. :param value: from self.text_viewer :type value: {str} """ self._text_viewer = value @doi.setter def doi(self, d): """ Using regular expressions, make sure the doi is valid before instantiating the article object. """ if validate_doi(d) is False: raise Exception("Invalid format for PLOS DOI: {}".format(d)) self.reset_memoized_attrs() self._doi = d def doi_link(self): """The link of the DOI, which redirects to the journal URL.""" return doi_url + self.doi def get_remote_xml(self): """For an article, parse its XML file at the location of self.url. Uses the lxml element tree to create the string, which is saved to a local file when downloaded :returns: string of entire remote article file :rtype: {str} """ remote_xml = et.tostring(self.remote_tree, method='xml', encoding='unicode') return remote_xml def open_in_viewer(self, text_viewer=None): """Open a local article file of interest in an external text viewer. :param text_viewer: set via self.text_viewer, defaults to None :type text_viewer: str, optional :raises: TypeError """ if not (text_viewer or self.text_viewer): raise TypeError("You have not specified an text_viewer. Please do so.") subprocess.call([self._text_viewer, self.filepath]) def open_in_browser(self): """Opens the landing page (HTML) of an article in default browser. This is also the URL that the DOI resolves to """ subprocess.call(["open", self.page]) def get_element_xpath(self, tag_path_elements=None, remote=False): """For a local article's root element, grab particular sub-elements via XPath location. Defaults to reading the element location for uncorrected proofs/versions of record The basis of every method and property looking for particular metadata fields :param article_root: the xml file for a single article :param tag_path_elements: xpath location in the XML tree of the article file :param remote: whether using the remote XML in self.remote_tree (defaults to False) :return: list of elements in the article with that xpath location """ if tag_path_elements is None: tag_path_elements = ('/', 'article', 'front', 'article-meta', 'custom-meta-group', 'custom-meta', 'meta-value') tag_location = '/'.join(tag_path_elements) if remote: root = self.remote_tree.getroot() else: root = self.root return root.xpath(tag_location) def get_dates(self, string_=False, string_format='%Y-%m-%d'): """For an individual article, get all of its dates, including publication date (pubdate), submission date. Defaults to datetime objects :param string_: whether to return dates as a dictionary of strings :param string_format: if string_ is True, the format to return the dates in :return: dict of date types mapped to datetime objects for that article :rtype: {dict} """ dates = {} # first location is where pubdate and date added to collection are tag_path_1 = ["/", "article", "front", "article-meta", "pub-date"] element_list_1 = self.get_element_xpath(tag_path_elements=tag_path_1) for element in element_list_1: pub_type = element.get('pub-type') try: date = parse_article_date(element) except ValueError: print('Error getting pubdates for {}'.format(self.doi)) date = '' dates[pub_type] = date # second location is where historical dates are, including submission and acceptance tag_path_2 = ["/", "article", "front", "article-meta", "history"] element_list_2 = self.get_element_xpath(tag_path_elements=tag_path_2) for element in element_list_2: for part in element: date_type = part.get('date-type') try: date = parse_article_date(part) except ValueError: print('Error getting history dates for {}'.format(self.doi)) date = '' dates[date_type] = date # third location is for vor updates when it's updated (see `proof(self)`) rev_date = '' if self.proof == 'vor_update': tag_path = ('/', 'article', 'front', 'article-meta', 'custom-meta-group', 'custom-meta') xpath_results = self.get_element_xpath(tag_path_elements=tag_path) for result in xpath_results: if result.xpath('./meta-name')[0].text == 'Publication Update': rev_date_string = result.xpath('./meta-value')[0].text rev_date = datetime.datetime.strptime(rev_date_string, '%Y-%m-%d') break else: pass dates['updated'] = rev_date if string_: # can return dates as strings instead of datetime objects if desired for key, value in dates.items(): if value: dates[key] = value.strftime(string_format) return dates def dates_debug(self): """Whether the dates in self.get_dates() are in the correct order. check whether date received is before date accepted, is before pubdate accounts for potentially missing date fields :return: if dates are in right order or not :rtype: bool """ dates = self.get_dates() if dates.get('received', '') and dates.get('accepted', ''): if dates['received'] <= dates['accepted'] <= dates['epub']: order_correct = True else: order_correct = False elif dates.get('received', ''): if dates['received'] <= dates['epub']: order_correct = True else: order_correct = False elif dates.get('accepted', ''): if dates['accepted'] <= dates['epub']: order_correct = True else: order_correct = False else: order_correct = True return order_correct @property def volume(self): """Volume of the article.""" return int(self.root.xpath('/article/front/article-meta/volume')[0].text) @property def issue(self): """Issue of the article.""" return int(self.root.xpath('/article/front/article-meta/issue')[0].text) @property def elocation(self): """Elocation ID of the article.""" return self.root.xpath('/article/front/article-meta/elocation-id')[0].text def get_aff_dict(self): """For a given PLOS article, get list of contributor-affiliated institutions. Uses "rid"s to map individual contributors to their institutions More about rids: https://jats.nlm.nih.gov/archiving/tag-library/1.1/attribute/rid.html See also get_rid_dict() :returns: Dictionary of footnote ids to institution information :rtype: {dict} """ tags_to_aff = ["/", "article", "front", "article-meta"] article_aff_elements = self.get_element_xpath(tag_path_elements=tags_to_aff) aff_dict = {} aff_elements = [el for aff_element in article_aff_elements for el in aff_element.getchildren() ] for el in aff_elements: if el.tag == 'aff': if el.getchildren(): for sub_el in el.getchildren(): if sub_el.tag == 'addr-line': try: aff_text_fixed = ' '.join([aff_string.strip() for aff_string in sub_el.text.splitlines()]) except AttributeError: aff_text_fixed = et.tostring(sub_el, encoding='unicode', method='text') aff_dict[el.attrib['id']] = aff_text_fixed else: # the address for some affiliations is not wrapped in an addr-line tag aff_dict[el.attrib['id']] = el.text.replace('\n', '').replace('\r', '').replace('\t', '') return aff_dict def get_fn_dict(self): """For a given PLOS article, get list of footnotes. Used with rids to map individual contributors to their institutions More about rids: https://jats.nlm.nih.gov/archiving/tag-library/1.1/attribute/rid.html See also get_rid_dict() :returns: Dictionary of footnote ids to institution information :rtype: {dict} """ tags_to_fn = ["/", "article", "front", "article-meta", "author-notes"] article_fn_elements = self.get_element_xpath(tag_path_elements=tags_to_fn) fn_dict = {} fn_elements = [el for fn_element in article_fn_elements for el in fn_element.getchildren() ] for el in fn_elements: if el.attrib.get('id'): if el.getchildren(): for sub_el in el.getchildren(): if sub_el.tag == 'email': pass else: fn_dict[el.attrib['id']] = sub_el.text else: # in case is at top-level of element fn_dict[el.attrib['id']] = el.text.replace('\n', '').replace('\r', '').replace('\t', '') return fn_dict def get_corr_author_emails(self): """For an article, grab the email addresses of the corresponding authors. Parses the list of emails and groups by rid or by initials, if present. Can handle multiple emails for multiple authors if formatted correctly. The email addresses are in an element of author notes. While most articles have one corresponding author with one email address, sometimes there are 1) multiple authors, and/or 2) multiple emails per author. In the first case, author initials are used in the text to separate emails. In the second case, a comma is used to separate emails. Initials are how emails can be matched to multiple authors. See also `match_author_names_to_emails()` for the back-up method of name matching. :return: dictionary of rid or author initials mapped to list of email address(es) :rtype: {dict} """ tag_path = ["/", "article", "front", "article-meta", "author-notes"] try: author_notes_element = self.get_element_xpath(tag_path_elements=tag_path)[0] except IndexError: # no emails found return {} corr_emails = {} email_list = [] for note in author_notes_element: if note.tag == 'corresp': author_info = note.getchildren() for i, item in enumerate(author_info): # if author initials are in the same field as email address if item.tag == 'email' and item.text and all(x in item.text for x in ('(', ')')): email_info = item.text.split(' ') for i, info in enumerate(email_info): # prune out non-letters from initials & email email_info[i] = re.sub(r'[^a-zA-Z0-9=@\.+-]', '', info) try: corr_emails[email_info[1]] = [email_info[0]] except IndexError: print('Error parsing emails for {}'.format(self.doi)) pass # if no author initials (one corr author) elif item.tag == 'email' and item.tail is None and item.text: email_list.append(item.text) if item.text == '': print('No email available for {}'.format(self.doi)) if note.attrib['id']: corr_emails[note.attrib['id']] = email_list else: corr_emails['cor001'] = email_list # if more than one email per author; making sure no initials present (comma ok) elif item.tag == 'email' and re.sub(r'[^a-zA-Z0-9=]', '', str(item.tail)) is None: try: if author_info[i+1].tail is None: email_list.append(item.text) elif author_info[i+1].tail: corr_initials = re.sub(r'[^a-zA-Z0-9=]', '', author_info[i+1].tail) if not corr_emails.get(corr_initials): corr_emails[corr_initials] = [item.text] else: corr_emails[corr_initials].append(item.text) except IndexError: email_list.append(item.text) corr_emails[note.attrib['id']] = email_list if i > 1: print('Error handling multiple email addresses for {} in {}' .format(et.tostring(item), self.doi)) if item.text == '': print('No email available for {}'.format(self.doi)) # if author initials included (more than one corr author) elif item.tag == 'email' and item.tail: corr_email = item.text corr_initials = re.sub(r'[^a-zA-Z0-9=]', '', item.tail) if not corr_initials: try: corr_initials = re.sub(r'[^a-zA-Z0-9=]', '', author_info[i+1].tail) except (IndexError, TypeError) as e: corr_initials = note.attrib['id'] if not corr_initials: print('email parsing is weird for', self.doi) if not corr_emails.get(corr_initials): corr_emails[corr_initials] = [corr_email] else: corr_emails[corr_initials].append(corr_email) else: pass if not corr_emails: author_notes_field = et.tostring(author_notes_element, method='text', encoding='unicode') if '@' in author_notes_field: regex_email = r'[\w\.-]+@[\w\.-]+' email_finder = re.compile(regex_email) email_list = email_finder.findall(author_notes_field) if email_list: corr_emails['cor001'] = email_list return corr_emails def get_contributions_dict(self): """For articles that don't use the CREDiT taxonomy, compile a dictionary of author contribution types matched to author initials. Work in progress!! Works for highly formatted lists with subelements (e.g. '10.1371/journal.pone.0170354') and structured single strings (e.g. '10.1371/journal.pone.0050782'), but still fails for unusual strings (e.g, '10.1371/journal.pntd.0000072') See also get_credit_taxonomy() for the CREDiT taxonomy version. TODO: Use regex to properly separate author roles from initials for unusual strings. :return: dictionary mapping author initials to their author contributions/roles. """ if self.type_ in ['correction', 'retraction', 'expression-of-concern']: # these article types don't have proper 'authors' return {} tag_path = ["/", "article", "front", "article-meta", "author-notes"] try: author_notes_element = self.get_element_xpath(tag_path_elements=tag_path)[0] except IndexError: return {} author_contributions = {} contrib_dict = {} initials_list = [] for note in author_notes_element: if note.attrib.get('fn-type', None) == 'con': try: # for highly structured lists with sub-elements for each item # Example: 10.1371/journal.pone.0170354' con_element = note[0][0] con_list = con_element.getchildren() for con_item in con_list: try: contribution = con_item[0][0].text.rstrip(':') contributor_initials = (con_item[0][0].tail.lstrip(' ').rstrip('.')).split(' ') initials_list.extend(contributor_initials) contrib_dict[contribution] = contributor_initials except (IndexError, AttributeError) as e: print('Error parsing contributions item {}: {}'.format(self.doi, et.tostring(con_item, encoding='unicode', method='xml'))) pass except IndexError: # for single strings, though it doesn't parse all of them correctly. # Example: '10.1371/journal.pone.0050782' contributions = note[0].text if contributions is None: print('Error parsing contributions for {}: {}'.format(self.doi, et.tostring(con_element, encoding='unicode', method='xml'))) return {} contribution_list = re.split(': |\. ', contributions) contribb_dict = dict(list(zip(contribution_list[::2], contribution_list[1::2]))) for k, v in contribb_dict.items(): v_new = v.split(' ') v_new = [v.rstrip('.').strip('\n') for v in v_new] contrib_dict[k.strip('\n')] = v_new initials_list.extend(v_new) for initials in (set(initials_list)): contrib_list = [] for k, v in contrib_dict.items(): if initials in v: contrib_list.append(k) author_contributions[initials] = contrib_list return author_contributions def get_contributors_info(self): """Get and organize information about each contributor for an article. This includes both authors and editors of the article. This function both creates article-level dictionaries of contributor information, as well as parses individual <contrib> elements. It reconciles the dicts together using a number of external functions from article_elements.py :returns: dictionary of metadata for each <contrib> element :rtype: list of dicts """ # TODO: param to remove unnecessary fields (initials) and dicts (rid_dict) # TODO: also get funding information, data availability, COI, etc # get dictionary of ids to institutional affiliations & all other footnotes aff_dict = self.get_aff_dict() fn_dict = self.get_fn_dict() aff_dict.update(fn_dict) matching_error = False # get dictionary of corresponding author email addresses email_dict = self.get_corr_author_emails() # get author contributions (if no credit taxonomy) credit_dict = self.get_contributions_dict() # get list of contributor elements (one per contributor) tag_path = ["/", "article", "front", "article-meta", "contrib-group", "contrib"] contrib_list = self.get_element_xpath(tag_path_elements=tag_path) contrib_dict_list = [] error_printed = False # iterate through each contributor for contrib in contrib_list: # initialize contrib dict with default fields contrib_keys = ['contrib_initials', 'given_names', 'surname', 'group_name', 'ids', 'rid_dict', 'contrib_type', 'author_type', 'editor_type', 'email', 'affiliations', 'author_roles', 'footnotes' ] contrib_dict = dict.fromkeys(contrib_keys, None) try: contrib_dict.update(get_contrib_info(contrib)) except TypeError: # minimize number of times this prints out if not error_printed: print('Error getting contrib info for {}'.format(self.doi, self.type_)) error_printed = True else: pass # map affiliation footnote ids to the actual institutions try: if contrib_dict.get('rid_dict', None) is not None: contrib_dict['affiliations'] = [aff_dict.get(aff, "") for k, v in contrib_dict['rid_dict'].items() for aff in v if k == 'aff' ] except (TypeError, AttributeError) as e: print('error constructing affiliations for {}: {} {}' .format(self.doi, contrib_dict.get('given_names'), contrib_dict.get('surname'))) contrib_dict['affiliations'] = [""] try: contrib_dict['footnotes'] = [aff_dict.get(aff, "") for k, v in contrib_dict['rid_dict'].items() for aff in v if k == 'fn' ] except AttributeError: print('error constructing footnote matches for {}: {} {}' .format(self.doi, contrib_dict.get('given_names'), contrib_dict.get('surname'))) contrib_dict['affiliations'] = [""] # make list of all contribs contrib_dict_list.append(contrib_dict) # match authors to credit_dicts (from author notes) if necessary if credit_dict: author_list = [author for author in contrib_dict_list if author.get('contrib_type', None) == 'author'] author_list, credit_matching_error = match_contribs_to_dicts(author_list, credit_dict, contrib_key='author_roles') for author in author_list: role_list = author.get('author_roles', None) author['author_roles'] = {'author_notes': role_list} if credit_matching_error: print('Warning: authors not matched correctly to author_roles for {}' .format(self.doi)) # match corresponding authors to email addresses corr_author_list = [contrib for contrib in contrib_dict_list if contrib.get('author_type', None) == 'corresponding'] if not corr_author_list and email_dict: print('Email but no corresponding author found for {}'.format(self.doi)) # matching_error = True if corr_author_list and not email_dict: print('Corr emails not found for {}'.format(self.doi)) matching_error = True if len(corr_author_list) == 1: corr_author = corr_author_list[0] try: corr_author['email'] = email_dict[corr_author['rid_dict']['corresp'][0]] except KeyError: if len(email_dict) == 1: corr_author['email'] = list(email_dict.values())[0] else: print('one_corr_author error finding email for {} in {}'.format(corr_author, email_dict)) matching_error = True elif email_dict and len(corr_author_list) > 1 and len(set([tuple(x) for x in email_dict.values()])) > 1: corr_author_list, matching_error = match_contribs_to_dicts(corr_author_list, email_dict, contrib_key='email') elif len(corr_author_list) > 1: if email_dict and (len(email_dict) == 1 or len(set([tuple(x) for x in email_dict.values()])) == 1): # if there's only one email address, use it for all corr authors for corr_author in corr_author_list: corr_author['email'] = list(email_dict.values())[0] else: matching_error = True else: corr_author_list = [] match_error_printed = False if email_dict and len(email_dict) > len(corr_author_list) > 0: print('Contributing author email included for {}' .format(self.doi)) match_error_printed = True elif email_dict and 1 < len(email_dict) < len(corr_author_list): print('{} corresponding author email(s) missing for {}' .format(len(corr_author_list) - len(email_dict), self.doi)) match_error_printed = True if matching_error and email_dict and not match_error_printed: print('Warning: corresponding authors not matched correctly to email addresses for {}' .format(self.doi)) return contrib_dict_list def get_related_dois(self): """For a given article, get the list of DOIs of related PLOS articles. Creates a dictionary of related dois & their type from the <related-articles> xpath location Use primarily to map amendment notifications to articles that have been amended :return: dictionary of related DOIs :rtype: dict """ related_article_elements = self.get_element_xpath(tag_path_elements=["/", "article", "front", "article-meta", "related-article"]) related_article_dict = {} if related_article_elements: for elem in related_article_elements: related_doi = elem.attrib related_article = related_doi['{http://www.w3.org/1999/xlink}href'] related_article = related_article.lstrip('info:doi/') if not related_article_dict.get(elem.attrib['related-article-type'], None): # begin building the list of DOIs with that related-article-type related_article_dict[elem.attrib['related-article-type']] = [related_article] else: # there is more than one article with the same related-article-type related_article_dict[elem.attrib['related-article-type']].append(related_article) else: # no related articles exist pass return related_article_dict def check_if_link_works(self): """See if a link is valid (i.e., returns a '200' to the HTML request). Used for checking a URL to a PLOS article's landing page or XML file on journals.plos.org Full list of potential status codes: https://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html :return: boolean if HTTP status code returned available or unavailable, "error" if a different status code is returned than 200 or 404 """ request = requests.get(self.url) if request.status_code == 200: return True elif request.status_code == 404: return False else: return 'error' def check_if_doi_resolves(self, plos_valid=True): """Whether a PLOS DOI resolves via dx.doi.org to the correct article landing page. If the link works, make sure that it points to the same DOI Checks first if it's a valid DOI or see if it's a redirect. :return: 'works' if works as expected, 'doesn't work' if it doesn't resolve correctly, or if the metadata DOI doesn't match self.doi, return the metadata DOI """ if plos_valid and validate_doi(self.doi) is False: return "Not valid PLOS DOI structure" url = "http://dx.doi.org/" + self.doi if self.check_if_link_works() is True: headers = {"accept": "application/vnd.citationstyles.csl+json"} r = requests.get(url, headers=headers) r_doi = r.json()['DOI'] if r_doi == self.doi: return "works" else: return r_doi else: return "doesn't work" @property def xml(self): """Returns string from local xml file. """ if self.tree is None: return None else: local_xml = et.tostring(self.tree, method='xml', encoding='unicode') return local_xml @property def tree(self): """The element tree object created from an article's local XML file See http://lxml.de/api/lxml.etree._ElementTree-class.html After accessing tree for the first time, it stores as an attribute :returns: article's element tree :rtype: {lxml.etree._ElementTree-class} or None """ if self._tree is None: if self.local: local_element_tree = et.parse(self.filepath) self._tree = local_element_tree else: print("Local article file not found: {}".format(self.filepath)) return None else: pass return self._tree @property def root(self): """Get the root (base) element of an article. """ return self.tree.getroot() def get_page(self, page_type='article'): """Get any of the PLOS URLs associated with a particular DOI. Based on `get_page_base()`, which customizes the beginning URL by journal. :param page_type: one of the keys in `plos_page_dict`, defaults to article """ BASE_LANDING_PAGE = _get_base_page(self.journal) try: page = BASE_LANDING_PAGE + LANDING_PAGE_SUFFIX.format(plos_page_dict[page_type], self.doi) if page_type == 'assetXMLFile': page += URL_SUFFIX except KeyError: raise Exception('Invalid page_type; value must be one of the following: {}'.format(list(plos_page_dict.keys()))) return page @property def page(self): """ The URL of the landing page for an article. Where to access an article's HTML version """ return self.get_page() @property def url(self): """The direct url of an article's XML file. """ return self.get_page(page_type='assetXMLFile') @property def taxonomy(self): """Taxonomy information. For a complete list of subject areas see https://github.com/PLOS/plos-thesaurus """ tag_path_elements = ('/', 'article', 'front', 'article-meta', 'article-categories') e_list = self.get_element_xpath(tag_path_elements=tag_path_elements) subjs_dict = {} for subj in e_list[0].getchildren(): try: sbjindex = subj.values()[0].strip() if sbjindex in subjs_dict: subjs_dict[sbjindex].append(tuple(e.text for e in subj.iter('subject'))) else: subjs_dict[sbjindex] = [tuple(e.text for e in subj.iter('subject'))] except IndexError: if 'No subject' in subjs_dict: subjs_dict['No subject'].append(tuple(e.text for e in subj.iter('subject'))) else: subjs_dict['No subject'] = [tuple(e.text for e in subj.iter('subject'))] return subjs_dict @property def filepath(self): """The path on the local file system to a given article's XML file. """ if 'annotation' in self.doi: article_path = os.path.join(self.directory, 'plos.correction.' + self.doi.split('/')[-1] + '.xml') else: article_path = os.path.join(self.directory, self.doi.lstrip('10.1371/') + '.xml') return article_path @property def filename(self): """The basename of the article's XML file. """ return os.path.basename(self.filepath) @property def local(self): """Boolean of whether the article is stored locally or not. Stored as attribute after first access """ if self._local is None: self._local = os.path.isfile(self.filepath) else: pass return self._local @property def proof(self): """ For a single article in a directory, check whether it is an 'uncorrected proof' or a 'VOR update' to the uncorrected proof, or neither. :return: proof status if it exists :rtype: str """ xpath_results = self.get_element_xpath() proof = '' for result in xpath_results: if result.text == 'uncorrected-proof': proof = 'uncorrected_proof' elif result.text == 'vor-update-to-uncorrected-proof': proof = 'vor_update' return proof @property def remote_proof(self): """ For a single article online, check whether it is an 'uncorrected proof' or a 'VOR update' to the uncorrected proof, or neither. :return: proof status if it exists; otherwise, None """ xpath_results = self.get_element_xpath(remote=True) proof = '' for result in xpath_results: if result.text == 'uncorrected-proof': proof = 'uncorrected_proof' elif result.text == 'vor-update-to-uncorrected-proof': proof = 'vor_update' return proof @property def remote_tree(self): """Gets the lxml element tree of an article from its remote URL. Can compare local (self.xml) to remote versions of XML :returns: article's online element tree :rtype: {lxml.etree._ElementTree-class} """ return et.fromstring(requests.get(self.url).content) @property def journal(self): """Journal that an article was published in. Can be PLOS Biology, Medicine, Neglected Tropical Diseases, Pathogens, Genetics, Computational Biology, ONE, or the now defunct Clinical Trials. Relies on a simple doi_to_journal transform when possible, and uses `Journal().parse_plos_journal()` for the "annotation" DOIs that don't have that journal information in the DOI. """ if 'annotation' not in self.doi: journal = Journal.doi_to_journal(self.doi) else: journal_meta = self.root.xpath('/article/front/journal-meta')[0] journal = str(Journal(journal_meta)) return journal @property def title(self): """For an individual PLOS article, get its title. :return: string of article title at specified xpath location """ title = self.get_element_xpath(tag_path_elements=["/", "article", "front", "article-meta", "title-group", "article-title"]) title_text = et.tostring(title[0], encoding='unicode', method='text', pretty_print=True) title_cleaned = " ".join(title_text.split()) return title_cleaned @property def rich_title(self): """For an individual PLOS article, get its title with HTML formatting. Preserves HTML formatting but removes all other XML tagging, namespace/xlink info, etc. Doesn't do xpath directly on `self.root` so can deannotate separate object See http://lxml.de/objectify.html#how-data-types-are-matched for more info on deannotate process Exceptions that still need handling: 10.1371/journal.pone.0179720, 10.1371/journal.pone.0068479, 10.1371/journal.pone.0069681, 10.1371/journal.pone.0068965, 10.1371/journal.pone.0083868, 10.1371/journal.pone.0069554, 10.1371/journal.pone.0068324, 10.1371/journal.pone.0067986, 10.1371/journal.pone.0068704, 10.1371/journal.pone.0068492, 10.1371/journal.pone.0068764, 10.1371/journal.pone.0068979, 10.1371/journal.pone.0068544, 10.1371/journal.pone.0069084, 10.1371/journal.pone.0069675 :return: string of article title at specified xpath location """ root = self.root objectify.deannotate(root, cleanup_namespaces=True, xsi_nil=True) art_title = root.xpath("/article/front/article-meta/title-group/article-title") art_title = art_title[0] try: text = art_title.text if text is None: text = '' text += ''.join(et.tostring(child, encoding='unicode') if child.tag not in ('ext-link', 'named-content', 'sc', 'monospace') \ else child.text + child.tail if child.tail is not None \ else child.text for child in art_title.getchildren()) title = text.replace(' xmlns:xlink="http://www.w3.org/1999/xlink"', '') \ .replace(' xmlns:mml="http://www.w3.org/1998/Math/MathML"', '') \ .replace(' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance', '') except TypeError: # try to rewrite so this isn't needed print('Error processing article title for {}'.format(self.doi)) title = et.tostring(art_title, method='text', encoding='unicode') return title @property def pubdate(self): """The date an article was published online. :returns: article publication date :rtype: {datetime.datetime} """ dates = self.get_dates() return dates['epub'] @property def revdate(self): """The date an article's version-of-record (`proof(self)` == 'vor_update') was published online. :returns: article revision date :rtype: {datetime.datetime} """ dates = self.get_dates() return dates['updated'] @property def license(self): """Return dictionary of CC license information from the license field.""" permissions = self.root.xpath('/article/front/article-meta/permissions')[0] return dict(License(permissions, self.doi)) @property def contributors(self): """ List of contributors to an article. Including authors and editors Stores as attribute after first access :returns: list of dictionaries for each contributor :rtype: {list} """ if self._contributors is None: self._contributors = self.get_contributors_info() else: pass return self._contributors @property def authors(self): """List of authors of an article. Including contributing and corresponding. For more about authorship criteria, see https://journals.plos.org/plosone/s/authorship """ contributors = self.contributors return [contrib for contrib in contributors if contrib.get('contrib_type', None) == 'author'] @property def corr_author(self): """List of corresponding authors of an article. """ contributors = self.contributors return [contrib for contrib in contributors if contrib.get('author_type', None) == 'corresponding'] @property def editor(self): """The editor on the article. For more about the editorial process, see https://journals.plos.org/plosone/s/editorial-and-peer-review-process """ contributors = self.contributors return [contrib for contrib in contributors if contrib.get('contrib_type', None) == 'editor'] @property def emails(self): """List of emails of corresponding author(s). Unlike get_corr_author_emails() dict, it does not differentiate by author. Joins multiple emails into a single list. :return: list of corresponding author email addresses """ email_dict = self.get_corr_author_emails() email_list = [] for k, v in email_dict.items(): email_list.extend(v) return email_list def emails_to_string(self): """Produces string of emails of corresponding author(s). Joins multiple emails into a single string, separated by semi-colons. Used for exporting to .csv :return: string of corresponding author email addresses """ return '; '.join(self.emails) @property def type_(self): """For an article file, get its JATS article type. Used primarily to find Correction (and thereby corrected) articles :return: JATS article_type at that xpath location """ type_element_list = self.get_element_xpath(tag_path_elements=["/", "article"]) return type_element_list[0].attrib['article-type'] @property def plostype(self): """For an article file, get its PLOS article type. This format is less standardized than the JATS article type (self.type_) :return: PLOS article_type at that xpath location """ article_categories = self.get_element_xpath(tag_path_elements=["/", "article", "front", "article-meta", "article-categories"]) subject_list = article_categories[0].getchildren() for i, subject in enumerate(subject_list): if subject.get('subj-group-type') == "heading": subject_instance = subject_list[i][0] s = '' for text in subject_instance.itertext(): s = s + text plos_article_type = s return plos_article_type @property def dtd(self): """Document Type Definition for an article. For more information on these DTD tagsets, see https://jats.nlm.nih.gov/1.1d3/ and https://dtd.nlm.nih.gov/3.0/ """ dtd = self.get_element_xpath(tag_path_elements=["/", "article"]) try: dtd = dtd[0].attrib['dtd-version'] if str(dtd) == '3.0': dtd = 'NLM 3.0' elif dtd == '1.1d3': dtd = 'JATS 1.1d3' except KeyError: print('Error parsing DTD from', self.doi) dtd = 'N/A' return dtd @property def abstract(self): """For an individual PLOS article, get the string of the abstract content. PLOS articles can have multiple abstract fields at the same XPath location, however the actual abstract is distinguished by having no attributes (`[count(@*)=0]`). Info about the article abstract: https://journals.plos.org/plosone/s/submission-guidelines#loc-abstract :return: plain-text string of content in abstract """ abstract_list = self.get_element_xpath(tag_path_elements=["/", "article", "front", "article-meta", "abstract[count(@*)=0]"]) if abstract_list: abstract = abstract_list[0] assert len(abstract_list) == 1 abstract_text = et.tostring(abstract[0], encoding='unicode', method='text') else: if self.type_ == 'research-article' and self.plostype == 'Research Article': print('No abstract found for research article {}'.format(self.doi)) abstract_text = '' # clean up text: rem white space, new line marks, blank lines abstract_text = abstract_text.strip().replace(' ', '') abstract_text = os.linesep.join([s for s in abstract_text.splitlines() if s]) return abstract_text @property def body(self): """ For an individual PLOS article, get the string of the body content. :returns: main body of the article :rtype: {str} """ xml_tree = et.parse(self.filename) root = xml_tree.getroot() # limit the text to the body section body = root.find('./body') # remove supplementary material section for sec in body.findall('.//sec'): if 'supplementary-material' in [element.tag for element in sec]: parent=sec.getparent() parent.remove(sec) # remove unwanted xml elements remove_list = ['tr','td','caption','fig','graphic','table-wrap','ext-link'] for tag in remove_list: for element in body.findall('.//'+tag): parent=element.getparent() parent.remove(element) # convert XML to string body_text = et.tostring(body, encoding='utf8', method='text').decode() return(body_text) @property def amendment(self): """Whether the JATS article type is a correction, retraction, or expression of concern. These are the three article types ('amendments') that potentially warrant a change in the original article that they reference (i.e., the 'related-doi'.) See https://jats.nlm.nih.gov/archiving/tag-library/1.1/attribute/article-type.html :returns: True if an amendment article type, False if not :rtype: {bool} """ if self.type_ in ['correction', 'retraction', 'expression-of-concern']: return True else: return False @property def related_dois(self): """PLOS DOIs related to current article. Compresses the values of `self.get_related_dois()` dictionary into a single list of DOI strings More strict for which keys to include for corrections, retractions, and expressions of concern, the three amendment article types. :returns: list of related DOIs :rtype: list """ doi_list = [] related_doi_dict = self.get_related_dois() if self.amendment: # only use certain keys if an amendment article if self.type_ == 'correction': attrib_name = 'corrected-article' elif self.type_ == 'retraction': attrib_name = 'retracted-article' elif self.type_ == 'expression-of-concern': attrib_name = 'object-of-concern' for k, v in related_doi_dict.items(): if k == attrib_name: doi_list = v break if not doi_list: doi_list = [related for related_list in related_doi_dict.values() for related in related_list] print('{} has incorrect related_doi field attribute'.format(self.doi)) else: # flatten all dict values if not an amendment article if related_doi_dict: for k, v in related_doi_dict.items(): doi_list.extend(v) return doi_list @property def correction(self): """Get the DOIs of all corrections type articles that correct the current article. Some PLOS articles include a 'correction-forward' related-article-type, meaning an article that has been issued a correction is linked to its correcting article(s). Only for the SIX PLOS journals (i.e. not on PLOS ONE). Usually there is only one DOI, unless the article has been issued multiple corrections. :return: DOIs of the correction articles :rtype: list """ correction_doi = '' related_dois = self.get_related_dois() for k, v in related_dois.items(): if k == 'correction-forward': correction_doi = v break return correction_doi @property def counts(self): """For a single article, return a dictionary of the several counts functions that are available. Dictionary format for XML tags: {figures: fig-count, pages: page-count, tables: table-count} For articles without the figure and table counts fields, calculates those values using XPath. :return: counts dictionary of number of figures, pages, and tables in the article """ counts = {} tag_path = ["/", "article", "front", "article-meta", "counts"] count_element_list = self.get_element_xpath(tag_path_elements=tag_path) for count_element in count_element_list: for count_item in count_element: count = count_item.get('count') count_type = count_item.tag counts[count_type] = int(count) if len(counts) > 3: # this shouldn't happen print(counts) if 'fig-count' not in counts: counts['fig-count'] = len(self.root.xpath('.//fig')) if 'table-count' not in counts: counts['table-count'] = len(self.root.xpath('.//table-wrap')) return counts @property def word_count(self): """For an article, get how many words are in the body. :return: count of words in the body of the PLOS article """ body_element = self.get_element_xpath(tag_path_elements=["/", "article", "body"]) try: body_text = et.tostring(body_element[0], encoding='unicode', method='text') body_word_count = len(body_text.split(" ")) except IndexError: print("Error parsing article body: {}".format(self.doi)) body_word_count = 0 return body_word_count @filename.setter def filename(self, value): """Sets an article object using a local filename. Converts a filename to DOI using an existing function. :param value: filename :type value: string """ self.doi = filename_to_doi(value) @classmethod def from_filename(cls, filename): """Initiate an article object using a local XML file. Will set `self.directory` if the full file path is available. If not, it will default to `get_corpus_dir()` via `Article().__init__`. This method is most useful for instantiating an Article object when the file is not in the default corpus directory, or when changing directories. """ if os.path.isfile(filename): directory = os.path.dirname(filename) else: directory = None return cls(filename_to_doi(filename), directory=directory)