Python bs4.Tag() Examples
The following are 30 code examples of bs4.Tag().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module bs4, or try the search function.
Example #1
Source File: parse_equation.py From scholar-reader with Apache License 2.0 | 6 votes |
def _extract_tokens(element: Tag) -> List[Token]:
    """
    Return the tokens declared directly on this element.

    Only low-level elements (those whose name is in TOKEN_TAGS, like "<mi>" and "<mn>")
    that carry S2 token annotations produce a token. Higher-level nodes that solely group
    other low-level elements (like "<mrow>" and "<msub>") produce an empty list.
    """
    is_annotated_token = element.name in TOKEN_TAGS and _has_s2_token_annotations(element)
    if not is_annotated_token:
        return []

    # Index and character offsets are stored as string attributes on the element.
    token = Token(
        text=element.string,
        token_index=int(element["s2:index"]),
        start=int(element["s2:start"]),
        end=int(element["s2:end"]),
    )
    return [token]
Example #2
Source File: match_symbols.py From scholar-reader with Apache License 2.0 | 6 votes |
def _find_base_element(element: Tag) -> Optional[Tag]:
    """
    Locate the 'base element' of a symbol.

    The base element is usually the identifier that other symbols modify: for 'x^2' or
    'x_i' it is the '<mi>' element for 'x'. Returns None when neither the element nor any
    of its descendants qualifies.
    """
    BASE_ELEMENT_TAG = "mi"

    if element.name == BASE_ELEMENT_TAG:
        return element

    # Pre-order depth-first search: the first '<mi>' met is the base, because the base is
    # the first child of '<msub>' / '<msup>' elements. The generator is lazy, so the search
    # stops descending as soon as one subtree yields a match.
    subtree_results = (
        _find_base_element(child) for child in element.children if isinstance(child, Tag)
    )
    return next((found for found in subtree_results if found is not None), None)
Example #3
Source File: gus.py From gusregon with BSD 2-Clause "Simplified" License | 6 votes |
def get_pkd(self, *args, **kwargs):
    """
    Return the PKD entries listed in the full report for the looked-up entity.

    All positional and keyword arguments are forwarded to ``self._get_details``.

    :return: List of dicts with the keys ``code``, ``name`` and ``main`` (``main`` is
        True when the report field ``pkdprzewazajace`` equals ``'1'``). Duplicate entries
        are dropped and the remaining ones keep the order in which they first appear in
        the report. An empty list is returned when no details or no report is available.
    """
    details = self._get_details(*args, **kwargs)
    if details is None:
        return []

    summary = BeautifulSoup(details, 'lxml')
    # The 'F' report type is the default; 'P' is used when the entity type contains 'P'.
    report_key = 'P' if 'P' in summary.typ.get_text() else 'F'
    report_type = self.pkd_report_type.get(report_key)
    report = self._service(
        'DanePobierzPelnyRaport', summary.regon.get_text(), report_type)
    if report is None:
        return []

    # Keyed by the entry's items so duplicates collapse; a dict (unlike a set) keeps
    # insertion order, which makes the result deterministic across runs.
    unique_entries = {}
    for item in BeautifulSoup(report, 'lxml').find_all('dane'):
        # Field names look like '<prefix>_<name>'; drop the prefix and normalise the rest.
        fields = {
            child.name.split('_', 1)[1].replace('_', '').lower(): child.get_text()
            for child in item.children if isinstance(child, Tag)}
        entry = {
            'code': fields['pkdkod'],
            'name': fields['pkdnazwa'],
            'main': fields['pkdprzewazajace'] == '1'}
        unique_entries.setdefault(tuple(entry.items()), entry)
    return list(unique_entries.values())
Example #4
Source File: html.py From PyBloqs with GNU Lesser General Public License v2.1 | 6 votes |
def construct_element(container=None, content=None, tag=None, element_type=None):
    """
    Build an element, attaching it to the container when one is given.

    :param container: Container to add the element to; when None a root element is created.
    :param content: String representation of content (e.g. JS or CSS)
    :param tag: Tag name, e.g. "script" or "style"
    :param element_type: E.g. "text/javascript" or "text/css"
    :return: New element.
    """
    element = (
        root(tag, type=element_type)
        if container is None
        else append_to(container, tag, type=element_type)
    )
    if content is not None:
        element.string = content
    return element
Example #5
Source File: UtilBot.py From HangoutsBot with GNU General Public License v3.0 | 6 votes |
def define(word, num=1):
    """
    Look up ``word`` on WordNet Web and return its ``num``-th definition.

    :param word: Word to look up. It is URL-encoded before being placed in the query
        string, so spaces and reserved characters no longer corrupt the request.
    :param num: 1-based index of the wanted definition; values below 1 are treated as 1.
    :return: Always a ``(text, total)`` tuple. On success ``text`` is the definition
        followed by ``[num of total]`` and ``total`` is the number of definitions found.
        On a download failure or when the definition does not exist, ``text`` is an
        error message and ``total`` is 0.
    """
    # Local import: only this function needs URL quoting.
    from urllib import parse

    if num < 1:
        num = 1

    url = ("http://wordnetweb.princeton.edu/perl/webwn?s=" + parse.quote_plus(str(word))
           + "&sub=Search+WordNet&o2=&o0=&o8=1&o1=1&o7=&o5=&o9=&o6=&o3=&o4=&h=0000000000")

    try:
        # NOTE(review): assumes `request` is urllib.request, whose response object is a
        # context manager - confirm against the file's imports.
        with request.urlopen(url) as response:
            soup = BeautifulSoup(response)
    except Exception:
        # Best-effort lookup: any download/parse failure is reported, not raised.
        # (Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.)
        return "Network Error: Couldn't download definition.", 0

    if soup.ul is not None:
        definitions = [
            entry.text for entry in soup.ul
            if isinstance(entry, Tag) and entry.text not in ('\n', '')
        ]
        if len(definitions) >= num:
            labelled = definitions[num - 1] + '[' + str(num) + ' of ' + str(len(definitions)) + ']'
            # The first three characters of each entry are dropped (presumably a
            # part-of-speech prefix - TODO confirm against the page markup).
            return labelled[3:].capitalize(), len(definitions)
    return "Couldn\'t find definition.", 0
Example #6
Source File: article_data.py From flask-react-spa with MIT License | 6 votes |
def html(self):
    """
    Render the article's markdown to an HTML fragment.

    Image ``src`` attributes are rewritten through ``self._get_static_url`` and the
    ``<html>``/``<body>`` wrappers added by the parser are stripped. When the article is a
    directory that contains the article stylesheet, a ``<link>`` to it is prepended.
    """
    rendered = markdown.markdown(
        self.markdown, extensions=MARKDOWN_EXTENSIONS, output_format='html5')

    # fix image links
    soup = BeautifulSoup(rendered, 'lxml')
    for image in soup.find_all('img'):
        image.attrs['src'] = self._get_static_url(image.attrs['src'])

    # strip html and body tags
    body = soup.find('body') or ''
    if isinstance(body, SoupTag):
        body = ''.join(str(node) for node in body.contents)

    # prefix stylesheet if necessary; `and` short-circuits so dir_path is only
    # touched for directory articles
    has_stylesheet = self.is_dir and os.path.exists(
        os.path.join(self.dir_path, ARTICLE_STYLESHEET_FILENAME))
    if not has_stylesheet:
        return body

    href = self._get_static_url(ARTICLE_STYLESHEET_FILENAME)
    return f'<link rel="stylesheet" type="text/css" href="{href}">' + body
Example #7
Source File: fns.py From foxford_courses with MIT License | 6 votes |
def retrieve_erly_iframe_src(self, video_source_response: CachedResponse) -> Union[str, dict]:
    """
    Extract the ``src`` of the ``div.full_screen > iframe`` element from a response.

    :param video_source_response: Response whose ``content`` is parsed as HTML.
    :return: The iframe's ``src`` string on success. On failure a dict with a single
        ``"fatal_error"`` key describing what was missing (which is why the return
        annotation is a union rather than plain ``str``).
    """
    erly_iframe: Union[Tag, None] = pipe(
        lambda r_content: BeautifulSoup(r_content, "html.parser"),
        lambda soup: soup.select_one("div.full_screen > iframe"),
    )(video_source_response.content)

    if not erly_iframe:
        return {"fatal_error": ".full_screen > iframe wasn't found"}

    erly_iframe_src: Union[str, None] = erly_iframe.get("src")
    if not erly_iframe_src:
        return {"fatal_error": ".full_screen > iframe doesn't have src attribute"}

    return erly_iframe_src
Example #8
Source File: html.py From PyBloqs with GNU Lesser General Public License v2.1 | 6 votes |
def append_to(parent, tag, **kwargs):
    """
    Create a new element and append it to the supplied parent.

    :param parent: Parent to append to.
    :param tag: Tag to create.
    :param kwargs: Tag kwargs, used as the new tag's attributes.
    :return: New element.
    """
    soup = parent.soup if hasattr(parent, "soup") else parent.find_parent("html")

    # Create Tag explicitly instead of using new_tag, otherwise attribute "name" leads to
    # clash with tag-name in bs4
    child = bs4.Tag(builder=soup.builder, name=tag, attrs=kwargs)
    # Remember the owning soup on the new tag so later appends can reuse it.
    child.soup = soup
    parent.append(child)
    return child
Example #9
Source File: extractor.py From html-table-extractor with MIT License | 6 votes |
def __init__(self, input, id_=None, **kwargs):
    """
    Locate the target table in ``input``.

    :param input: HTML string or bs4 Tag containing (or being) the table.
    :param id_: ``id`` of the table to look up when the root element is not a table.
    :param kwargs: Optional ``transformer`` callable; defaults to ``str``.
    """
    # TODO: should divide this class into two subclasses
    # to deal with string and bs4.Tag separately

    # validate the input
    if not isinstance(input, (str, Tag)):
        raise Exception('Unrecognized type. Valid input: str, bs4.element.Tag')

    if isinstance(input, str):
        # Parse the markup and start from its first element.
        soup = BeautifulSoup(input, 'html.parser').find()
    else:
        soup = input

    # locate the target table
    self._table = soup if soup.name == 'table' else soup.find(id=id_)

    self._transformer = kwargs.get('transformer', str)
    self._output = []
Example #10
Source File: parse.py From wiki-table-scrape with MIT License | 5 votes |
def clean_cell(cell):
    """Return a clean string value for a bs4.Tag table cell from Wikipedia."""
    removable = (
        # Tooltip references with mouse-over effects
        {"name": "sup", "class": "reference"},
        # Keys for special sorting effects on the table
        {"name": "sup", "class": "sortkey"},
        # Wikipedia `[edit]` buttons
        {"name": "span", "class": "mw-editsection"},
    )

    # Remove extra tags not essential to the table
    for spec in removable:
        for unwanted in cell.findAll(**spec):
            unwanted.extract()

    # Replace line breaks with spaces
    for line_break in cell.findAll("br"):
        line_break.replace_with(new_span(" "))

    # If cell is only a single image, use its alt-text
    descendants = cell.findAll()
    if len(descendants) == 1 and descendants[0].name == "img":
        return spaces_only(descendants[0]["alt"])

    # Reduce remaining cell to text, minus footnotes and other bracketed sections
    fragments = (
        piece for piece in cell.findAll(text=True) if not piece.startswith("[")
    )
    return spaces_only("".join(fragments))
Example #11
Source File: test_parse_equation.py From scholar-reader with Apache License 2.0 | 5 votes |
def load_fragment_tag(filename: str) -> Tag:
    """Load a MathML fragment from the test fixtures and return its BeautifulSoup tag."""
    fragment_path = get_test_path(os.path.join("mathml-fragments", filename))
    with open(fragment_path) as fragment_file:
        markup = fragment_file.read()

    # The parser wraps the markup in 'html' and 'body' tags, so 'body.next' is the node
    # that was actually parsed.
    soup = BeautifulSoup(markup, "lxml")
    return soup.body.next
Example #12
Source File: css_match.py From bazarr with GNU General Public License v3.0 | 5 votes |
def is_tag(obj):
    """Return whether `obj` is a BeautifulSoup `Tag`."""

    # Imported lazily, inside the function, exactly as the original does.
    from bs4 import Tag as _SoupTag
    return isinstance(obj, _SoupTag)
Example #13
Source File: css_match.py From bazarr with GNU General Public License v3.0 | 5 votes |
def assert_valid_input(cls, tag):
    """Raise `TypeError` unless `tag` is accepted by `cls.is_tag`."""

    if cls.is_tag(tag):
        return

    # Fail on unexpected types.
    message = "Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag))
    raise TypeError(message)
Example #14
Source File: parse.py From wiki-table-scrape with MIT License | 5 votes |
def new_span(text):
    """Build a bs4.Tag <span> element wrapping the given value."""
    # Parse a tiny document and pull the span out from under the html/body wrappers.
    document = bs4.BeautifulSoup(f"<span>{text}</span>", "lxml")
    return document.html.body.span
Example #15
Source File: parse_equation.py From scholar-reader with Apache License 2.0 | 5 votes |
def create_element(tag_name: str) -> Tag:
    """Return a new BeautifulSoup tag named `tag_name`."""
    # 'new_tag' is only reachable through a soup object, so an empty one is created for it.
    scratch_soup = BeautifulSoup("", "lxml")
    return scratch_soup.new_tag(tag_name)
Example #16
Source File: oxford_learning.py From FastWordQuery with GNU General Public License v3.0 | 5 votes |
def _clean(self, tg): """ :type tg:Tag :return: """ if not tg: return tg decompose_cls = ['xr-gs', 'sound', 'heading', 'topic', 'collapse', 'oxford3000'] if tg.attrs and 'class' in tg.attrs: for _cls in decompose_cls: _tgs = tg.find_all(attrs=self._cls_dic(_cls), recursive=True) for _tg in _tgs: _tg.decompose() rmv_attrs = ['dpsid', 'id', 'psg', 'reg'] try: tg.attrs = {key: value for key, value in tg.attrs.items() if key not in rmv_attrs} except ValueError: pass for child in tg.children: if not isinstance(child, Tag): continue self._clean(child) return tg
Example #17
Source File: oxford_learning.py From FastWordQuery with GNU General Public License v3.0 | 5 votes |
def _pull_ame_phon(self): try: _tag_phn = self.tag_phon_nam.find('span', self._cls_dic('phon')).get_text().replace('/', '').replace('NAmE', '') phon = '/{}/'.format(_tag_phn.text if isinstance(_tag_phn, Tag) else _tag_phn) except: phon = '' return phon
Example #18
Source File: oxford_learning.py From FastWordQuery with GNU General Public License v3.0 | 5 votes |
def _pull_bre_phon(self): try: _tag_phn = self.tag_phon_bre.find('span', self._cls_dic('phon')).get_text().replace('/', '').replace('BrE', '') phon = '/{}/'.format(_tag_phn.text if isinstance(_tag_phn, Tag) else _tag_phn) except: phon = '' return phon
Example #19
Source File: oxford_learning.py From FastWordQuery with GNU General Public License v3.0 | 5 votes |
def tag_phon_nam(self):
    """
    Find the ``pron-g`` span with ``geo='n_am'`` under ``self.tag_pron``.

    :rtype: Tag
    """
    pron_g_filter = self._cls_dic('pron-g')
    return self.tag_pron.find('span', pron_g_filter, geo='n_am')

# ---- Explains
Example #20
Source File: match_symbols.py From scholar-reader with Apache License 2.0 | 5 votes |
def _create_soup_element(mathml: str) -> Optional[Tag]:
    """
    Parse MathML and return the parsed node.

    Returns the first node after the 'body' tag when the parsed document has a body,
    otherwise the soup itself. Returns None (after logging a warning) when parsing raises
    an AttributeError.
    """
    try:
        soup = BeautifulSoup(mathml, "lxml")
    except AttributeError as e:
        logging.warning("BeautifulSoup could not parse MathML: '%s', %s", mathml, e)
        return None

    if soup.body:
        return soup.body.next
    return soup
Example #21
Source File: parse_equation.py From scholar-reader with Apache License 2.0 | 5 votes |
def _is_error_element(element: Tag) -> bool:
    """Tell whether a BeautifulSoup tag represents a KaTeX parse error."""
    # An error is an 'mstyle' element whose 'mathcolor' is the KaTeX error color.
    if element.name != "mstyle":
        return False
    return bool(element.attrs.get("mathcolor") == KATEX_ERROR_COLOR)
Example #22
Source File: oxford_learning.py From FastWordQuery with GNU General Public License v3.0 | 5 votes |
def tag_img(self):
    """
    Find the first ``<a>`` element matching the ``topic`` class filter in ``self.bs``.

    :rtype: Tag
    """
    topic_filter = self._cls_dic('topic')
    return self.bs.find('a', topic_filter)
Example #23
Source File: parse_equation.py From scholar-reader with Apache License 2.0 | 5 votes |
def merge_mathml_elements(elements: List[Tag]) -> List[Tag]:
    """Merge the given elements using a fresh MathMlElementMerger and return the result."""
    return MathMlElementMerger().merge(elements)
Example #24
Source File: parse_equation.py From scholar-reader with Apache License 2.0 | 5 votes |
def merge(self, elements: List[Tag]) -> List[Tag]:
    """
    Merge consecutive elements in a list of elements.

    The input list is not modified; a new list of elements is returned.
    """
    self.merged: List[Tag] = []  # pylint: disable=attribute-defined-outside-init
    self.to_merge: List[Tag] = []  # pylint: disable=attribute-defined-outside-init

    # Main loop: iterate over elements, merging when possible.
    for candidate in elements:
        # Skip over whitespace.
        if isinstance(candidate, str) and candidate.isspace():
            continue

        # An element that can't be merged flushes everything waiting to be merged and is
        # then passed through unchanged.
        if not self._is_mergeable_type(candidate):
            self._merge_prior_elements()
            self.merged.append(candidate)
            continue

        # A mergeable element joins the pending run if it can; otherwise the pending run
        # is merged first (nothing more will join it) and the element starts a new run.
        if not self._can_merge_with_prior_elements(candidate):
            self._merge_prior_elements()
        self.to_merge.append(candidate)

    # Flush whatever is still waiting to be merged.
    if self.to_merge:
        self._merge_prior_elements()

    return self.merged
Example #25
Source File: parse_equation.py From scholar-reader with Apache License 2.0 | 5 votes |
def _is_mergeable_type(self, element: Tag) -> bool:
    """Tell whether an element is of a type that can be merged with other elements."""
    # Mergeable means: a mergeable token tag that also carries S2 token annotations.
    if element.name not in MERGEABLE_TOKEN_TAGS:
        return False
    return _has_s2_token_annotations(element)
Example #26
Source File: parse_equation.py From scholar-reader with Apache License 2.0 | 5 votes |
def _can_merge_with_prior_elements(self, element: Tag) -> bool:
    """
    Decide whether an element can join the list of elements waiting to be merged.

    The caller is expected to have already checked the element with _is_mergeable_type.
    """
    # With nothing pending, the element simply starts a new run.
    if not self.to_merge:
        return True

    # Two elements only merge when one follows the other without spaces, i.e. the new
    # element starts exactly where the last pending one ends.
    previous = self.to_merge[-1]
    if element.attrs["s2:start"] != previous.attrs["s2:end"]:
        return False

    # Context-sensitive rules:
    # 1. A letter merges into any pending run that starts with a letter. This admits
    #    identifiers like "r2<d>2" (target letter in <angled brackets>) but not constant
    #    multiplications like "4<x>", which should stay two symbols.
    if element.name == "mi":
        return bool(self.to_merge[0].name == "mi")

    # 2. Numbers merge into letters before them, adding to the identifier.
    # 3. Numbers merge into numbers before them, extending an identifier or forming a
    #    multi-digit number.
    return element.name == "mn"
Example #27
Source File: css_match.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def assert_valid_input(cls, tag):
    """Validate that `tag` is a tag or document, raising `TypeError` otherwise."""

    tag_is_valid = cls.is_tag(tag)
    # Fail on unexpected types.
    if not tag_is_valid:
        raise TypeError(
            "Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(type(tag))
        )
Example #28
Source File: css_match.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def is_tag(obj):
    """Report whether `obj` is an instance of `bs4.Tag`."""

    # bs4 is imported at call time, exactly as the original does.
    import bs4 as _bs4
    tag_type = _bs4.Tag
    return isinstance(obj, tag_type)
Example #29
Source File: gus.py From gusregon with BSD 2-Clause "Simplified" License | 5 votes |
def _remove_prefix(self, data):
    """
    Parse the ``dane`` element of ``data`` into a dict with un-prefixed field names.

    Each child tag's name is split on its first underscore and the part after it becomes
    the key; the value is the tag's text with surrounding whitespace stripped.
    """
    soup = BeautifulSoup(data, 'lxml')
    raw_fields = {
        node.name: node.get_text() for node in soup.dane if isinstance(node, Tag)
    }
    return {
        name.split('_', 1)[1]: text.strip() for name, text in raw_fields.items()
    }
Example #30
Source File: css_match.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def assert_valid_input(cls, tag):
    """Ensure the input is a tag or document; raise `TypeError` for anything else."""

    # Fail on unexpected types.
    if not cls.is_tag(tag):
        received = type(tag)
        raise TypeError(
            "Expected a BeautifulSoup 'Tag', but instead recieved type {}".format(received)
        )