Python bs4.Tag() Examples

The following are 30 code examples of bs4.Tag(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module bs4, or try the search function.
Example #1
Source File: parse_equation.py    From scholar-reader with Apache License 2.0 6 votes vote down vote up
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Return the tokens defined directly on this element.

    Only elements whose name is in TOKEN_TAGS and that carry the 's2:*' token annotations
    produce a token; any other element (for example a grouping node such as "<mrow>" or
    "<msub>") yields an empty list.
    """

    # Guard clause: elements that are not annotated token tags contribute nothing.
    if element.name not in TOKEN_TAGS or not _has_s2_token_annotations(element):
        return []

    token = Token(
        text=element.string,
        token_index=int(element["s2:index"]),
        start=int(element["s2:start"]),
        end=int(element["s2:end"]),
    )
    return [token]
Example #2
Source File: match_symbols.py    From scholar-reader with Apache License 2.0 6 votes vote down vote up
def _find_base_element(element: Tag) -> Optional[Tag]:
    """
    Locate the 'base element' of a symbol: the first '<mi>' identifier met in a pre-order,
    depth-first walk starting at 'element' (the element itself counts). For 'x^2' or 'x_i'
    this is the '<mi>' for 'x', because the base is the first child of '<msup>' / '<msub>'.
    Returns None when neither the element nor any descendant is an '<mi>'.
    """

    base_tag_name = "mi"

    if element.name == base_tag_name:
        return element

    for node in element.children:
        # Text nodes cannot be (or contain) an identifier; only descend into tags.
        if not isinstance(node, Tag):
            continue
        found = _find_base_element(node)
        if found is not None:
            return found

    return None
Example #3
Source File: gus.py    From gusregon with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def get_pkd(self, *args, **kwargs):
        """
        Return the de-duplicated list of PKD entries for an entity.

        The arguments are forwarded unchanged to ``self._get_details``. When details are
        available, the full report is requested through ``self._service`` using the report
        type stored under 'P' when the entity's ``typ`` text contains 'P', otherwise the
        one stored under 'F'.

        :return: list of dicts with keys 'code', 'name' and 'main' (``True`` when the
            'pkdprzewazajace' field equals '1'). Duplicates are removed and the order of
            first appearance in the report is kept. An empty list is returned when no
            details or no report are available.
        """
        pkd = []
        details = self._get_details(*args, **kwargs)
        if details is not None:
            data = BeautifulSoup(details, 'lxml')
            report_type = self.pkd_report_type.get('F')
            if 'P' in data.typ.get_text():
                report_type = self.pkd_report_type.get('P')
            report = self._service(
                'DanePobierzPelnyRaport', data.regon.get_text(), report_type)
            if report is not None:
                for item in BeautifulSoup(report, 'lxml').find_all('dane'):
                    # Field names look like '<prefix>_<rest>'; drop the prefix and any
                    # remaining underscores so they can be looked up as e.g. 'pkdkod'.
                    fields = {i.name.split('_', 1)[1].replace('_', '').lower(): i.get_text()
                              for i in item.children if isinstance(i, Tag)}
                    pkd.append({
                        'code': fields['pkdkod'],
                        'name': fields['pkdnazwa'],
                        'main': fields['pkdprzewazajace'] == '1'})
                # De-duplicate via dict keys instead of a set so that the result order is
                # deterministic (first occurrence wins) rather than hash-dependent.
                pkd = [dict(t) for t in dict.fromkeys(tuple(d.items()) for d in pkd)]
        return pkd
Example #4
Source File: html.py    From PyBloqs with GNU Lesser General Public License v2.1 6 votes vote down vote up
def construct_element(container=None, content=None, tag=None, element_type=None):
    """
    Build a new element, attaching it to a container when one is given.

    :param container: Container to add the element to; when None the element is created
        through ``root`` instead of being appended.
    :param content: String representation of content (e.g. JS or CSS); left unset when None.
    :param tag: Tag name, e.g. "script" or "style"
    :param element_type: E.g. "text/javascript" or "text/css"
    :return: New element.
    """
    el = (
        root(tag, type=element_type)
        if container is None
        else append_to(container, tag, type=element_type)
    )
    if content is None:
        return el
    el.string = content
    return el
Example #5
Source File: UtilBot.py    From HangoutsBot with GNU General Public License v3.0 6 votes vote down vote up
def define(word, num=1):
    """
    Look up the ``num``-th WordNet definition of ``word``.

    :param word: Word (or phrase) to look up. It is URL-encoded before being placed in
        the query string, so spaces and special characters are safe.
    :param num: 1-based index of the definition to return; values below 1 are treated as 1.
    :return: A ``(text, count)`` tuple. On success ``text`` is the definition followed by
        '[num of count]' and ``count`` is the number of definitions found. On any failure
        ``text`` is an error message and ``count`` is 0.
    """
    from urllib import parse

    if num < 1:
        num = 1

    # A non-string word cannot be turned into a query; report it with the same
    # (message, count) shape as every other return path.
    if not isinstance(word, str):
        return "Couldn't download definition.", 0

    url = ("http://wordnetweb.princeton.edu/perl/webwn?s=" + parse.quote_plus(word)
           + "&sub=Search+WordNet&o2=&o0=&o8=1&o1=1&o7=&o5=&o9=&o6=&o3=&o4=&h=0000000000")

    try:
        # BeautifulSoup reads the response inside the constructor, so the connection
        # can be closed as soon as the soup has been built.
        with request.urlopen(url) as response:
            soup = BeautifulSoup(response)
    except Exception:
        return "Network Error: Couldn't download definition.", 0

    if soup.ul is not None:
        definitions = [x.text for x in soup.ul
                       if isinstance(x, Tag) and x.text not in ('\n', '')]
        if len(definitions) >= num:
            labelled = definitions[num - 1] + '[' + str(num) + ' of ' + str(len(definitions)) + ']'
            # The first three characters of each entry are skipped before capitalising.
            return labelled[3:].capitalize(), len(definitions)
    return "Couldn't find definition.", 0
Example #6
Source File: article_data.py    From flask-react-spa with MIT License 6 votes vote down vote up
def html(self):
        """
        Render ``self.markdown`` to an HTML fragment.

        Image ``src`` attributes are rewritten through ``self._get_static_url``, the
        surrounding html/body wrapper added by the parser is stripped, and a stylesheet
        ``<link>`` is prepended when the article is a directory that contains the
        article stylesheet file.
        """
        rendered = markdown.markdown(
            self.markdown,
            extensions=MARKDOWN_EXTENSIONS,
            output_format='html5',
        )

        # Point every image at its static URL.
        soup = BeautifulSoup(rendered, 'lxml')
        for image in soup.find_all('img'):
            image.attrs['src'] = self._get_static_url(image.attrs['src'])

        # Keep only what is inside <body>.
        body = soup.find('body') or ''
        if isinstance(body, SoupTag):
            body = ''.join(str(node) for node in body.contents)

        # Only directory articles that ship a stylesheet get the <link> prefix.
        has_stylesheet = self.is_dir and os.path.exists(
            os.path.join(self.dir_path, ARTICLE_STYLESHEET_FILENAME))
        if not has_stylesheet:
            return body

        href = self._get_static_url(ARTICLE_STYLESHEET_FILENAME)
        return f'<link rel="stylesheet" type="text/css" href="{href}">' + body
Example #7
Source File: fns.py    From foxford_courses with MIT License 6 votes vote down vote up
def retrieve_erly_iframe_src(self, video_source_response: CachedResponse) -> Union[str, dict]:
        """
        Extract the ``src`` of the ``div.full_screen > iframe`` element from a response.

        :param video_source_response: Response whose ``content`` is parsed as HTML.
        :return: The iframe's ``src`` value on success. On failure a dict of the form
            ``{"fatal_error": <message>}`` is returned instead (when the iframe is missing
            or has no ``src``), so callers must check the type of the result.
        """
        erly_iframe: Union[Tag, None] = pipe(
            lambda r_content: BeautifulSoup(
                r_content,
                "html.parser"
            ),
            lambda soup: soup.select_one(
                "div.full_screen > iframe"
            )
        )(video_source_response.content)

        if not erly_iframe:
            return {"fatal_error": ".full_screen > iframe wasn't found"}

        erly_iframe_src: Union[str, None] = erly_iframe.get("src")

        # A missing or empty src attribute is reported the same way.
        if not erly_iframe_src:
            return {"fatal_error": ".full_screen > iframe doesn't have src attribute"}

        return erly_iframe_src
Example #8
Source File: html.py    From PyBloqs with GNU Lesser General Public License v2.1 6 votes vote down vote up
def append_to(parent, tag, **kwargs):
    """
    Create a new tag and append it to ``parent``.

    :param parent: Parent to append to.
    :param tag: Name of the tag to create.
    :param kwargs: Attributes of the new tag.
    :return: New element.
    """
    # Use the soup recorded on the parent when present, otherwise look up its <html> ancestor.
    soup = parent.soup if hasattr(parent, "soup") else parent.find_parent("html")

    # The Tag is instantiated directly rather than via new_tag, because an attribute called
    # "name" would otherwise clash with bs4's tag-name argument.
    element = bs4.Tag(builder=soup.builder, name=tag, attrs=kwargs)
    # Remember the soup on the new element so later appends to it can reuse it.
    element.soup = soup
    parent.append(element)
    return element
Example #9
Source File: extractor.py    From html-table-extractor with MIT License 6 votes vote down vote up
def __init__(self, input, id_=None, **kwargs):
        """
        Set up the extractor from HTML text or an already-parsed tag.

        :param input: HTML string or bs4 Tag containing (or being) the table.
        :param id_: id of the target element, used when the root node is not a table.
        :param kwargs: optional 'transformer' callable; defaults to ``str``.
        :raises Exception: if ``input`` is neither a str nor a Tag.
        """
        # TODO: should divide this class into two subclasses
        # to deal with string and bs4.Tag separately

        if not (isinstance(input, str) or isinstance(input, Tag)):
            raise Exception('Unrecognized type. Valid input: str, bs4.element.Tag')

        # Strings are parsed and reduced to their first element; tags are used as-is.
        if isinstance(input, str):
            root_node = BeautifulSoup(input, 'html.parser').find()
        else:
            root_node = input

        # Either the root is the table itself, or the table is looked up by id.
        self._table = root_node if root_node.name == 'table' else root_node.find(id=id_)

        self._transformer = kwargs['transformer'] if 'transformer' in kwargs else str

        self._output = []
Example #10
Source File: parse.py    From wiki-table-scrape with MIT License 5 votes vote down vote up
def clean_cell(cell):
    """Return a clean string value for a Wikipedia table cell given as a bs4.Tag."""

    removable = (
        # Tooltip references with mouse-over effects
        {"name": "sup", "class": "reference"},
        # Keys for special sorting effects on the table
        {"name": "sup", "class": "sortkey"},
        # Wikipedia `[edit]` buttons
        {"name": "span", "class": "mw-editsection"},
    )

    # Drop decoration that is not part of the cell's value.
    for selector in removable:
        for unwanted in cell.findAll(**selector):
            unwanted.extract()

    # Turn every line break into a single space.
    for br in cell.findAll("br"):
        br.replace_with(new_span(" "))

    # A cell holding nothing but one image is represented by the image's alt-text.
    remaining = cell.findAll()
    if len(remaining) == 1 and remaining[0].name == "img":
        return spaces_only(remaining[0]["alt"])

    # Otherwise join the text nodes, skipping footnotes and other bracketed sections.
    fragments = (piece for piece in cell.findAll(text=True) if not piece.startswith("["))
    return spaces_only("".join(fragments))
Example #11
Source File: test_parse_equation.py    From scholar-reader with Apache License 2.0 5 votes vote down vote up
def load_fragment_tag(filename: str) -> Tag:
    """Load a MathML fragment from the 'mathml-fragments' test directory as a BeautifulSoup tag."""

    fragment_path = get_test_path(os.path.join("mathml-fragments", filename))
    with open(fragment_path) as fragment_file:
        markup = fragment_file.read()

    # The lxml parser wraps the fragment in 'html' and 'body' tags, so 'body.next' is the
    # node that was originally in the file.
    soup = BeautifulSoup(markup, "lxml")
    return soup.body.next
Example #12
Source File: css_match.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def is_tag(obj):
        """Return whether `obj` is a BeautifulSoup `Tag` instance."""

        # Imported at call time, as in the original, rather than at module level.
        import bs4

        tag_type = bs4.Tag
        return isinstance(obj, tag_type)
Example #13
Source File: css_match.py    From bazarr with GNU General Public License v3.0 5 votes vote down vote up
def assert_valid_input(cls, tag):
        """Raise `TypeError` unless `tag` passes `cls.is_tag`."""

        if cls.is_tag(tag):
            return

        # Anything that is not a tag is rejected, reporting the offending type.
        message = "Expected a BeautifulSoup 'Tag', but instead recieved type {}"
        raise TypeError(message.format(type(tag)))
Example #14
Source File: parse.py    From wiki-table-scrape with MIT License 5 votes vote down vote up
def new_span(text):
    """Build a fresh bs4.Tag <span> element whose content is the given value."""
    markup = f"<span>{text}</span>"
    # lxml wraps the markup in <html><body>; walk down to the span itself.
    document = bs4.BeautifulSoup(markup, "lxml")
    return document.html.body.span
Example #15
Source File: parse_equation.py    From scholar-reader with Apache License 2.0 5 votes vote down vote up
def create_element(tag_name: str) -> Tag:
    """Return a new BeautifulSoup tag named tag_name."""

    # An empty throwaway document is built only to get at its 'new_tag' factory.
    scratch_document = BeautifulSoup("", "lxml")
    return scratch_document.new_tag(tag_name)
Example #16
Source File: oxford_learning.py    From FastWordQuery with GNU General Public License v3.0 5 votes vote down vote up
def _clean(self, tg):
        """

        :type tg:Tag
        :return:
        """
        if not tg:
            return tg
        decompose_cls = ['xr-gs', 'sound', 'heading', 'topic', 'collapse', 'oxford3000']

        if tg.attrs and 'class' in tg.attrs:
            for _cls in decompose_cls:
                _tgs = tg.find_all(attrs=self._cls_dic(_cls), recursive=True)
                for _tg in _tgs:
                    _tg.decompose()

        rmv_attrs = ['dpsid', 'id', 'psg', 'reg']
        try:
            tg.attrs = {key: value for key, value in tg.attrs.items()
                        if key not in rmv_attrs}
        except ValueError:
            pass
        for child in tg.children:
            if not isinstance(child, Tag):
                continue
            self._clean(child)
        return tg 
Example #17
Source File: oxford_learning.py    From FastWordQuery with GNU General Public License v3.0 5 votes vote down vote up
def _pull_ame_phon(self):
        try:
            _tag_phn = self.tag_phon_nam.find('span', self._cls_dic('phon')).get_text().replace('/', '').replace('NAmE', '')
            phon = '/{}/'.format(_tag_phn.text if isinstance(_tag_phn, Tag) else _tag_phn)
        except:
            phon = ''
        return phon 
Example #18
Source File: oxford_learning.py    From FastWordQuery with GNU General Public License v3.0 5 votes vote down vote up
def _pull_bre_phon(self):
        try:
            _tag_phn = self.tag_phon_bre.find('span', self._cls_dic('phon')).get_text().replace('/', '').replace('BrE', '')
            phon = '/{}/'.format(_tag_phn.text if isinstance(_tag_phn, Tag) else _tag_phn)
        except:
            phon = ''
        return phon 
Example #19
Source File: oxford_learning.py    From FastWordQuery with GNU General Public License v3.0 5 votes vote down vote up
def tag_phon_nam(self):
        """
        Find the 'pron-g' span with geo='n_am' inside ``self.tag_pron``.

        :rtype: Tag
        """
        pron = self.tag_pron
        return pron.find('span', self._cls_dic('pron-g'), geo='n_am')

    # ---- Explains 
Example #20
Source File: match_symbols.py    From scholar-reader with Apache License 2.0 5 votes vote down vote up
def _create_soup_element(mathml: str) -> Optional[Tag]:
    """
    Parse a MathML string with the lxml parser and return its root node.

    Returns the node following the generated 'body' tag when there is one, the soup itself
    otherwise, and None (after logging a warning) if parsing raises an AttributeError.
    """
    try:
        parsed = BeautifulSoup(mathml, "lxml")
    except AttributeError as e:
        logging.warning("BeautifulSoup could not parse MathML: '%s', %s", mathml, e)
        return None

    if parsed.body:
        return parsed.body.next
    return parsed
Example #21
Source File: parse_equation.py    From scholar-reader with Apache License 2.0 5 votes vote down vote up
def _is_error_element(element: Tag) -> bool:
    """Tell whether a BeautifulSoup tag is an 'mstyle' coloured with the KaTeX error colour."""

    if not (element.name == "mstyle"):
        return False
    color = element.attrs.get("mathcolor")
    return bool(color == KATEX_ERROR_COLOR)
Example #22
Source File: oxford_learning.py    From FastWordQuery with GNU General Public License v3.0 5 votes vote down vote up
def tag_img(self):
        """
        Find the first 'a' element with the 'topic' class in ``self.bs``.

        :rtype: Tag
        """
        document = self.bs
        return document.find('a', self._cls_dic('topic'))
Example #23
Source File: parse_equation.py    From scholar-reader with Apache License 2.0 5 votes vote down vote up
def merge_mathml_elements(elements: List[Tag]) -> List[Tag]:
    """Merge the given elements using a fresh MathMlElementMerger and return the result."""
    return MathMlElementMerger().merge(elements)
Example #24
Source File: parse_equation.py    From scholar-reader with Apache License 2.0 5 votes vote down vote up
def merge(self, elements: List[Tag]) -> List[Tag]:
        """
        Return a new list in which runs of consecutive mergeable elements have been merged.
        The input list itself is left unmodified.
        """
        self.merged: List[Tag] = []  # pylint: disable=attribute-defined-outside-init
        self.to_merge: List[Tag] = []  # pylint: disable=attribute-defined-outside-init

        for element in elements:
            # Whitespace-only strings are ignored entirely.
            if isinstance(element, str) and element.isspace():
                continue

            # A non-mergeable element ends any pending run and is passed through unchanged.
            if not self._is_mergeable_type(element):
                self._merge_prior_elements()
                self.merged.append(element)
                continue

            # A mergeable element that cannot join the pending run closes that run first,
            # then starts (or continues) the run itself.
            if not self._can_merge_with_prior_elements(element):
                self._merge_prior_elements()
            self.to_merge.append(element)

        # Flush whatever is still pending at the end of the input.
        if len(self.to_merge) > 0:
            self._merge_prior_elements()

        return self.merged
Example #25
Source File: parse_equation.py    From scholar-reader with Apache License 2.0 5 votes vote down vote up
def _is_mergeable_type(self, element: Tag) -> bool:
        """Tell whether an element is of a kind that may be merged with other elements."""
        if element.name not in MERGEABLE_TOKEN_TAGS:
            return False
        # Mergeable tag names additionally need the s2 token annotations.
        return _has_s2_token_annotations(element)
Example #26
Source File: parse_equation.py    From scholar-reader with Apache License 2.0 5 votes vote down vote up
def _can_merge_with_prior_elements(self, element: Tag) -> bool:
        """
        Decide whether an element may join the run of elements currently waiting to be
        merged. The caller is expected to have already checked the element with
        _is_mergeable_type.
        """

        # With nothing pending, the element trivially merges with an empty run.
        if not self.to_merge:
            return True

        # The element must begin exactly where the previous one ended (no gap between them).
        previous = self.to_merge[-1]
        if element.attrs["s2:start"] != previous.attrs["s2:end"]:
            return False

        # Context-sensitive rules:
        # 1. A letter may only extend a run that itself starts with a letter. This permits
        #    identifiers such as "r2d2", but keeps constant multiplications such as "4x"
        #    split into two symbols.
        if element.name == "mi":
            return bool(self.to_merge[0].name == "mi")

        # 2. A number may follow letters (extending the identifier), and
        # 3. a number may follow numbers (extending an identifier or a multi-digit number).
        # Anything else cannot be merged.
        return element.name == "mn"
Example #27
Source File: css_match.py    From Tautulli with GNU General Public License v3.0 5 votes vote down vote up
def assert_valid_input(cls, tag):
        """Raise `TypeError` unless `tag` passes `cls.is_tag`."""

        if cls.is_tag(tag):
            return

        # Anything that is not a tag is rejected, reporting the offending type.
        message = "Expected a BeautifulSoup 'Tag', but instead recieved type {}"
        raise TypeError(message.format(type(tag)))
Example #28
Source File: css_match.py    From Tautulli with GNU General Public License v3.0 5 votes vote down vote up
def is_tag(obj):
        """Return whether `obj` is a BeautifulSoup `Tag` instance."""

        # Imported at call time, as in the original, rather than at module level.
        import bs4

        tag_type = bs4.Tag
        return isinstance(obj, tag_type)
Example #29
Source File: gus.py    From gusregon with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def _remove_prefix(self, data):
        """
        Parse the 'dane' element of an XML/HTML string into a dict.

        Keys are the child tag names with everything up to the first underscore removed;
        values are the children's text with surrounding whitespace stripped.
        """
        soup = BeautifulSoup(data, 'lxml')

        # First collect the raw text of every child tag, keyed by its full name.
        raw_fields = {}
        for node in soup.dane:
            if isinstance(node, Tag):
                raw_fields[node.name] = node.get_text()

        # Then drop the '<prefix>_' part of each name and tidy the values.
        return {full_name.split('_', 1)[1]: text.strip()
                for full_name, text in raw_fields.items()}
Example #30
Source File: css_match.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def assert_valid_input(cls, tag):
        """Raise `TypeError` unless `tag` passes `cls.is_tag`."""

        if cls.is_tag(tag):
            return

        # Anything that is not a tag is rejected, reporting the offending type.
        message = "Expected a BeautifulSoup 'Tag', but instead recieved type {}"
        raise TypeError(message.format(type(tag)))