Source code for tibiapy.parsers.forum

"""Events related to the forum section."""
from __future__ import annotations

import datetime
import re
import urllib.parse
from typing import TYPE_CHECKING, Optional

from tibiapy import InvalidContentError, errors
from tibiapy.builders import (
    CMPostArchiveBuilder,
    ForumAnnouncementBuilder,
    ForumBoardBuilder,
    ForumThreadBuilder,
)
from tibiapy.enums import ThreadStatus, Vocation
from tibiapy.models import (
    AnnouncementEntry,
    BoardEntry,
    CMPost,
    CMPostArchive,
    ForumAnnouncement,
    ForumAuthor,
    ForumBoard,
    ForumEmoticon,
    ForumPost,
    ForumSection,
    ForumThread,
    GuildMembership,
    LastPost,
    ThreadEntry,
)
from tibiapy.utils import (
    clean_text,
    convert_line_breaks,
    get_rows,
    parse_form_data,
    parse_integer,
    parse_link_info,
    parse_pagination,
    parse_tables_map,
    parse_tibia_datetime,
    parse_tibia_forum_datetime,
    parse_tibiacom_content,
    split_list,
    try_enum,
)

if TYPE_CHECKING:
    import bs4

__all__ = (
    "CMPostArchiveParser",
    "ForumAnnouncementParser",
    "ForumBoardParser",
    "ForumSectionParser",
    "ForumThreadParser",
)

# Captures the timezone abbreviation (CET or CEST) from a "times are ..." label.
timezone_regex = re.compile(r"times are (CES?T)")
# Captures a GIF file name. The dot is escaped so only a literal ".gif" extension matches;
# unescaped, it matched any character (e.g. "no_gif").
filename_regex = re.compile(r"(\w+\.gif)")
# Matches the parenthesised "(Pages ...)" suffix found in thread titles.
pages_regex = re.compile(r"\(Pages[^)]+\)")

# Captures world, vocation and level from the author's information text.
author_info_regex = re.compile(r"Inhabitant of (\w+)\nVocation: ([\w\s]+)\nLevel: (\d+)")
# Captures the author's post count.
author_posts_regex = re.compile(r"Posts: (\d+)")
# Captures the guild rank and the guild name (with optional title) from "<rank> of the <guild>".
guild_regexp = re.compile(r"([\s\w()]+)\sof the\s(.+)")
# Splits "<guild name> (<title>)" into its two parts.
guild_title_regexp = re.compile(r"([^(]+)\s\(([^)]+)\)")
# Matches timestamps in the form "dd.mm.yyyy hh:mm:ss".
post_dates_regex = re.compile(r"(\d{2}\.\d{2}\.\d{4}\s\d{2}:\d{2}:\d{2})")
# Captures the name of the character that edited a post.
edited_by_regex = re.compile(r"Edited by (.*) on \d{2}")

# Text that separates a post's content from the author's signature.
signature_separator = "________________"

FORUM_POSITIONS = ["Tutor", "Community Manager", "Customer Support", "Programmer", "Game Content Designer", "Tester"]
"""Special positions displayed for characters in the forums."""


class CMPostArchiveParser:
    """Parser for the content of the CM Post Archive page in Tibia.com."""

    @classmethod
    def from_content(cls, content: str) -> CMPostArchive:
        """Parse the content of the CM Post Archive page from Tibia.com.

        Parameters
        ----------
        content:
            The HTML content of the CM Post Archive in Tibia.com

        Returns
        -------
            The CM Post archive found in the page.

        Raises
        ------
        InvalidContent
            If content is not the HTML content of the CM Post Archive in Tibia.com
        """
        parsed_content = parse_tibiacom_content(content)
        form = parsed_content.select_one("form")
        try:
            # The form must contain exactly six selectors: month, day and year for each end of the range.
            (start_month_selector, start_day_selector, start_year_selector,
             end_month_selector, end_day_selector, end_year_selector) = form.select("select")
            start_date = cls._get_selected_date(start_month_selector, start_day_selector, start_year_selector)
            end_date = cls._get_selected_date(end_month_selector, end_day_selector, end_year_selector)
        except (AttributeError, ValueError) as e:
            raise errors.InvalidContentError("content does not belong to the CM Post Archive in Tibia.com", e) from e

        builder = CMPostArchiveBuilder().from_date(start_date).to_date(end_date)
        table = parsed_content.select_one("table.Table3")
        if not table:
            return builder.build()

        inner_table_container = table.select_one("div.InnerTableContainer")
        inner_table = inner_table_container.select_one("table")
        # Only keep the rows that are direct children of the inner table, discarding rows of nested tables.
        inner_table_rows = [e for e in get_rows(inner_table) if e.parent == inner_table]
        table_content = inner_table_container.select_one("table.TableContent")
        _header_row, *rows = get_rows(table_content)
        for row in rows:
            columns = row.select("td")
            date = parse_tibia_datetime(clean_text(columns[0]))
            board_thread_column = columns[1]
            # Board and thread title are separated by a line break inside the same cell.
            convert_line_breaks(board_thread_column)
            board, thread = board_thread_column.text.splitlines()
            post_link = parse_link_info(columns[2].select_one("a"))
            post_id = int(post_link["query"]["postid"])
            builder.add_entry(CMPost(posted_on=date, board=board, thread_title=thread, post_id=post_id))

        if not rows:
            return builder.build()

        # The pagination information is read from the last row of the inner table.
        page, total_pages, results = parse_pagination(inner_table_rows[-1])
        builder.current_page(page).total_pages(total_pages).results_count(results)
        return builder.build()

    # endregion

    # region Private Methods

    @classmethod
    def _get_selected_date(
        cls,
        month_selector: bs4.Tag,
        day_selector: bs4.Tag,
        year_selector: bs4.Tag,
    ) -> Optional[datetime.date]:
        """Get the date made from the selected options in the selectors.

        For each selector, the option marked as selected is used, falling back to its first option.

        Parameters
        ----------
        month_selector: :class:`bs4.Tag`
            The month selector.
        day_selector: :class:`bs4.Tag`
            The day selector.
        year_selector: :class:`bs4.Tag`
            The year selector.

        Returns
        -------
        :class:`datetime.date`
            The selected date, or :obj:`None` if the selectors do not describe a valid date.
        """
        selected_month = month_selector.select_one("option[selected]") or month_selector.select_one("option")
        selected_day = day_selector.select_one("option[selected]") or day_selector.select_one("option")
        selected_year = year_selector.select_one("option[selected]") or year_selector.select_one("option")
        try:
            return datetime.date(
                year=int(selected_year["value"]),
                month=int(selected_month["value"]),
                day=int(selected_day["value"]),
            )
        except (KeyError, TypeError, ValueError):
            # TypeError: a selector has no options at all (subscripting None).
            # KeyError: the chosen option has no "value" attribute.
            # ValueError: non-numeric value or an impossible date (e.g. February 31st).
            return None

    # endregion
class ForumSectionParser:
    """Parser for forum sections, such as world boards, trade boards, etcetera."""

    @classmethod
    def from_content(cls, content: str) -> ForumSection:
        """Parse a forum section from Tibia.com.

        Parameters
        ----------
        content:
            The HTML content from Tibia.com

        Returns
        -------
            The forum section found in the page.

        Raises
        ------
        InvalidContentError
            If the page does not contain a "Boards" table.
        """
        document = parse_tibiacom_content(content)
        tables_by_title = parse_tables_map(document)
        if "Boards" not in tables_by_title:
            raise InvalidContentError("Boards table not found.")

        board_rows = tables_by_title["Boards"].select("table.TableContent > tr:not(.LabelH)")

        # The section's id is read from the query string of the "redirect" parameter of the welcome link.
        welcome_link = parse_link_info(document.select_one("p.ForumWelcome > a"))
        redirect_url = urllib.parse.urlparse(welcome_link["query"]["redirect"])
        redirect_params = urllib.parse.parse_qs(redirect_url.query)
        raw_section_id = redirect_params["sectionid"][0]

        # Timestamps carry no timezone, so the UTC offset is derived from the current time label.
        current_time_tag = document.select_one("div.CurrentTime")
        if "CEST" in current_time_tag.text:
            utc_offset = 2
        else:
            utc_offset = 1

        entries = []
        for board_row in board_rows:
            parsed_board = cls._parse_board_row(board_row, utc_offset)
            if parsed_board is not None:
                entries.append(parsed_board)

        return ForumSection(section_id=int(raw_section_id), entries=entries)

    @classmethod
    def _parse_board_row(cls, board_row: bs4.Tag, offset: int = 1) -> Optional[BoardEntry]:
        """Parse a row containing a board and extract its information.

        Parameters
        ----------
        board_row: :class:`bs4.Tag`
            The row's parsed content.
        offset: :class:`int`
            Since the displayed dates do not contain timezone information, it is necessary to extract the
            timezone used from somewhere else and pass it to this method to adjust them accordingly.

        Returns
        -------
        :class:`BoardEntry`
            The board contained in this row, or :obj:`None` if the row has fewer than five columns.
        """
        cells = board_row.select("td")
        if len(cells) < 5:
            return None

        # The first cell is not read; the remaining four are consumed below.
        _, name_cell, posts_cell, threads_cell, last_post_cell = cells[:5]

        # Second column: name (link) and description
        link_tag = name_cell.select_one("a")
        description = name_cell.select_one("font").text
        link_info = parse_link_info(link_tag)
        board_name = link_info["text"]
        board_id = int(link_info["query"]["boardid"])

        # Third column: post count
        post_count = parse_integer(posts_cell.text)

        # Fourth column: thread count
        thread_count = parse_integer(threads_cell.text)

        # Fifth column: last post information
        last_post = LastPostParser._parse_column(last_post_cell, offset)

        return BoardEntry(
            name=board_name,
            board_id=board_id,
            description=description,
            posts=post_count,
            threads=thread_count,
            last_post=last_post,
        )
class ForumAnnouncementParser:
    """Parser for forum announcements posted by CipSoft."""

    @classmethod
    def from_content(cls, content: str, announcement_id: int = 0) -> Optional[ForumAnnouncement]:
        """Parse the content of an announcement's page from Tibia.com.

        Parameters
        ----------
        content:
            The HTML content of an announcement in Tibia.com
        announcement_id:
            The id of the announcement. Since there is no way to obtain the id from the page,
            the id may be passed to assign.

        Returns
        -------
            The announcement contained in the page or :obj:`None` if not found.

        Raises
        ------
        InvalidContent
            If content is not the HTML content of an announcement page in Tibia.com
        """
        parsed_content = parse_tibiacom_content(content)
        forum_breadcrumbs = parsed_content.select_one("div.ForumBreadCrumbs")
        if not forum_breadcrumbs:
            # Without breadcrumbs, the page is only accepted if it displays an error message box.
            message_box = parsed_content.select_one("div.TableContainer")
            if not message_box or "error" not in message_box.text.lower():
                raise errors.InvalidContentError("content is not a Tibia.com forum announcement.")

            return None

        section_link, board_link, *_ = forum_breadcrumbs.select("a")
        section_link_info = parse_link_info(section_link)
        section = section_link_info["text"]
        section_id = parse_integer(section_link_info["query"]["sectionid"])
        board_link_info = parse_link_info(board_link)
        board = board_link_info["text"]
        board_id = parse_integer(board_link_info["query"]["boardid"])

        builder = (ForumAnnouncementBuilder()
                   .section(section)
                   .section_id(section_id)
                   .board(board)
                   .board_id(board_id)
                   .announcement_id(announcement_id))

        times_container = parsed_content.select_one("div.ForumContentFooterLeft")
        # The timezone name is looked up inside the footer text, same as ForumSectionParser does.
        # An equality check against the whole text would only match if the footer were exactly "CEST".
        offset = 2 if "CEST" in times_container.text else 1

        post_container = parsed_content.select_one("div.ForumPost")
        character_info_container = post_container.select_one("div.PostCharacterText")
        builder.author(ForumAuthorParser._parse_author_table(character_info_container))

        post_text_container = post_container.select_one("div.PostText")
        title_tag = post_text_container.select_one("b")
        builder.title(title_tag.text)

        dates_container = post_text_container.select_one("font")
        dates = post_dates_regex.findall(dates_container.text)

        # Everything before the first horizontal rule is header; the rest is the announcement's content.
        announcement_content = post_text_container.encode_contents().decode()
        _, announcement_content = announcement_content.split("<hr/>", 1)
        builder.content(announcement_content)

        # Exactly two timestamps are expected: the start and end dates of the announcement.
        start_date, end_date = (parse_tibia_forum_datetime(date, offset) for date in dates)
        builder.from_date(start_date).to_date(end_date)
        return builder.build()
class ForumAuthorParser:
    """Parser for the author information displayed alongside forum posts."""

    @classmethod
    def _parse_author_table(cls, character_info_container: bs4.Tag) -> ForumAuthor:
        """Parse the table containing the author's information.

        Parameters
        ----------
        character_info_container: :class:`bs4.Tag`
            The container with the character's information.

        Returns
        -------
        :class:`ForumAuthor`
            The author's information.
        """
        # First link belongs to character
        char_link = character_info_container.select_one("a")
        if not char_link:
            # Without a link, the author is either a deleted or a traded character.
            name = character_info_container.text
            deleted = True
            traded = False
            if "(traded)" in name:
                name = name.replace("(traded)", "").strip()
                deleted = False
                traded = True

            return ForumAuthor(name=name, is_author_deleted=deleted, is_author_traded=traded)

        author = ForumAuthor(name=char_link.text)
        char_info = character_info_container.select_one("font.ff_infotext")
        position_info = character_info_container.select_one("font.ff_smallinfo")
        # Position and titles are shown the same way. If we have two, the title is first and then the position.
        # However, if the character only has one of them, there's no way to know which is it unless we validate against
        # possible types
        if position_info and (char_info is None or position_info.parent != char_info):
            convert_line_breaks(position_info)
            for _title in position_info.text.splitlines():
                if not _title:
                    continue

                if _title in FORUM_POSITIONS:
                    author.position = _title
                else:
                    author.title = _title

        if char_info is None:
            # Nothing else can be extracted without the information block.
            return author

        # Must be looked up before the text is read, as it is nested inside the information block.
        guild_info = char_info.select_one("font.ff_smallinfo")
        convert_line_breaks(char_info)
        char_info_text = char_info.text
        if info_match := author_info_regex.search(char_info_text):
            author.world = info_match.group(1)
            author.vocation = try_enum(Vocation, info_match.group(2))
            author.level = int(info_match.group(3))

        guild_match = guild_regexp.search(guild_info.text) if guild_info else None
        if guild_match:
            guild_name = guild_match.group(2)
            title = None
            # The guild's name may be followed by the member's title between parentheses.
            if title_match := guild_title_regexp.search(guild_name):
                guild_name = title_match.group(1)
                title = title_match.group(2)

            author.guild = GuildMembership(name=guild_name, rank=guild_match.group(1), title=title)

        if posts_match := author_posts_regex.search(char_info_text):
            author.posts = int(posts_match.group(1))

        return author
class ForumBoardParser:
    """A parser for forum boards from Tibia.com."""

    @classmethod
    def from_content(cls, content: str) -> Optional[ForumBoard]:
        """Parse the board's HTML content from Tibia.com.

        Parameters
        ----------
        content:
            The HTML content of the board.

        Returns
        -------
            The forum board contained, or :obj:`None` if the page says the requested board was not found.

        Raises
        ------
        InvalidContent
            Content is not a board in Tibia.com
        """
        parsed_content = parse_tibiacom_content(content)
        forum_breadcrumbs = parsed_content.select_one("div.ForumBreadCrumbs")
        if not forum_breadcrumbs:
            message_box = parsed_content.select_one("div.InnerTableContainer")
            if not message_box or "board you requested" not in message_box.text:
                raise errors.InvalidContentError("content does not belong to a board.")

            return None

        tables = parsed_content.select("table.TableContent")
        header_text = forum_breadcrumbs.text.strip()
        section, name = split_list(header_text, "|", "|")
        link_info = parse_link_info(forum_breadcrumbs.select_one("a"))
        section_id = int(link_info["query"]["sectionid"])
        builder = ForumBoardBuilder().name(name).section(section).section_id(section_id)

        forms = parsed_content.select("form")
        post_age_form = forms[0]
        data = parse_form_data(post_age_form)
        if "threadage" not in data.values:
            # Without the thread age form, nothing else is read from the page.
            return builder.build()

        builder.age(parse_integer(data.values["threadage"]))

        pagination_block = parsed_content.select_one("small")
        pages, total, _count = parse_pagination(pagination_block) if pagination_block else (0, 0, 0)
        builder.current_page(pages)
        builder.total_pages(total)

        # The last table holds the threads: its first row is skipped and its last row is not a thread.
        # NOTE(review): the last row presumably carries the timezone label, but no offset is derived
        # from it, so last post dates are parsed with the default offset - confirm whether intended.
        *thread_rows, _times_row = get_rows(tables[-1])
        for thread_row in thread_rows[1:]:
            entry = cls._parse_thread_row(thread_row.select("td"))
            if entry is None:
                # Rows that do not describe a thread are skipped instead of failing on a None entry.
                continue

            # The row may have no "class" attribute at all, so default to an empty list.
            if "ClassifiedProposal" in thread_row.attrs.get("class", []):
                entry.golden_frame = True

            builder.add_entry(entry)

        if len(tables) > 1:
            # When there is more than one table, the first one lists the announcements.
            announcement_rows = get_rows(tables[0])
            for announcement_row in announcement_rows[1:]:
                author_link, title_link = announcement_row.select("a")
                author = author_link.text.strip()
                announcement_link = parse_link_info(title_link)
                entry = AnnouncementEntry(
                    title=announcement_link["text"],
                    announcement_id=int(announcement_link["query"]["announcementid"]),
                    announcement_author=author,
                )
                builder.add_announcement(entry)

        if len(forms) > 2:
            board_selector_form = forms[2]
            data = parse_form_data(board_selector_form)
            builder.board_id(parse_integer(data.values["boardid"]))

        return builder.build()

    # endregion

    # region Private Methods

    @classmethod
    def _parse_thread_row(cls, columns: bs4.ResultSet) -> Optional[ThreadEntry]:
        """Parse the thread row, containing a single thread.

        Parameters
        ----------
        columns: :class:`bs4.ResultSet`
            The list of columns the thread contains.

        Returns
        -------
        :class:`ThreadEntry`
            The thread described by the columns, or :obj:`None` if there are fewer than seven columns
            or the title column contains no links.
        """
        # Seven columns are read below; anything shorter is not a thread row.
        if len(columns) < 7:
            return None

        # First Column: Thread's status
        status = None
        status_column = columns[0]
        status_img = status_column.select_one("img")
        status_icon = None
        if status_img:
            url = status_img["src"]
            filename = filename_regex.search(url).group(1)
            status_icon = url
            status = ThreadStatus.from_icon(filename)

        # Second column: Thread's emoticon
        emoticon = None
        emoticon_column = columns[1]
        emoticon_img = emoticon_column.select_one("img")
        if emoticon_img and emoticon_img.get("alt"):
            emoticon = ForumEmoticon(name=emoticon_img["alt"], url=emoticon_img["src"])

        # Third Column: Thread's title and number of pages
        pages = 1
        thread_column = columns[2]
        title = thread_column.text.strip()
        try:
            thread_link, *page_links = thread_column.select("a")
        except ValueError:
            # No links at all: this row does not point to a thread.
            return None

        if page_links:
            # The last page link tells the total number of pages; the "(Pages ...)" text is not part of the title.
            last_page_link = page_links[-1]
            link_info = parse_link_info(last_page_link)
            pages = int(link_info["query"]["pagenumber"])
            title = pages_regex.sub("", title).strip()

        link_info = parse_link_info(thread_link)
        thread_id = int(link_info["query"]["threadid"])

        # Fourth Column: Thread starter
        thread_starter_column = columns[3]
        thread_starter = thread_starter_column.text.strip()
        author_link = thread_starter_column.select_one("a")

        # Fifth Column: Number of replies
        replies = parse_integer(columns[4].text)

        # Sixth Column: Number of views
        views = parse_integer(columns[5].text)

        # Seventh Column: Last post information
        last_post = LastPostParser._parse_column(columns[6])

        traded = False
        if "(traded)" in thread_starter:
            traded = True
            thread_starter = thread_starter.replace("(traded)", "").strip()

        return ThreadEntry(
            title=title,
            thread_id=thread_id,
            thread_starter=thread_starter,
            replies=replies,
            views=views,
            last_post=last_post,
            emoticon=emoticon,
            status=status,
            total_pages=pages,
            status_icon=status_icon,
            thread_starter_traded=traded,
            # A starter without a link that is not marked as traded is considered deleted.
            thread_starter_deleted=author_link is None and not traded,
        )

    # endregion
class ForumThreadParser:
    """A parser for forum threads from Tibia.com."""

    @classmethod
    def from_content(cls, content: str) -> Optional[ForumThread]:
        """Create an instance of the class from the html content of the thread's page.

        Parameters
        ----------
        content:
            The HTML content of the page.

        Returns
        -------
            The thread contained in the page, or None if the thread doesn't exist

        Raises
        ------
        InvalidContent
            If content is not the HTML of a thread's page.
        """
        parsed_content = parse_tibiacom_content(content)
        forum_breadcrumbs = parsed_content.select_one("div.ForumBreadCrumbs")
        if not forum_breadcrumbs:
            message_box = parsed_content.select_one("div.InnerTableContainer")
            if not message_box or "not found" not in message_box.text:
                raise errors.InvalidContentError("content does not belong to a thread.")

            return None

        header_text = forum_breadcrumbs.text.strip()
        section_link, board_link = (parse_link_info(t) for t in forum_breadcrumbs.select("a"))
        section, board, partial_title = split_list(header_text, "|", "|")

        builder = (ForumThreadBuilder()
                   .section(section)
                   .section_id(int(section_link["query"]["sectionid"]))
                   .board_id(int(board_link["query"]["boardid"]))
                   .board(board))

        forum_title_container = parsed_content.select_one("div.ForumTitleText")
        if not forum_title_container:
            # Without a title container, only the title found in the breadcrumbs is available.
            builder.title(partial_title)
            return builder.build()

        builder.title(forum_title_container.text.strip())
        border = forum_title_container.parent.previous_sibling.previous_sibling
        builder.golden_frame("gold" in border["style"])

        pagination_block = parsed_content.select_one("td.PageNavigation")
        pages, total, _count = parse_pagination(pagination_block) if pagination_block else (0, 0, 0)
        builder.current_page(pages)
        builder.total_pages(total)

        posts_table = parsed_content.select_one("table.TableContent")
        thread_info_container = posts_table.select_one("div.ForumPostHeader")
        thread_info_text_container = thread_info_container.select_one("div.ForumPostHeaderText")
        # The header holds exactly two children: the "#<number>" text and the navigation container.
        thread_number, navigation_container = thread_info_text_container.children
        builder.thread_id(int(thread_number.split("#")[-1]))

        navigation_links = navigation_container.select("a")
        if len(navigation_links) == 2:
            prev_link_tag, next_link_tag = navigation_links
            prev_link = parse_link_info(prev_link_tag)
            builder.previous_topic_number(int(prev_link["query"]["threadid"]))
            next_link = parse_link_info(next_link_tag)
            builder.next_topic_number(int(next_link["query"]["threadid"]))
        elif navigation_links:
            # With a single link, its text tells whether it points to the previous or the next topic.
            only_link_tag = navigation_links[0]
            only_link = parse_link_info(only_link_tag)
            if "Previous" in only_link_tag.text:
                builder.previous_topic_number(int(only_link["query"]["threadid"]))
            else:
                builder.next_topic_number(int(only_link["query"]["threadid"]))
        # With no links at all, there are no neighbouring topics to record.

        times_container = posts_table.select_one("div.ForumContentFooterLeft")
        # The timezone name is looked up inside the footer text, same as ForumSectionParser does.
        # An equality check against the whole text would only match if the footer were exactly "CEST".
        offset = 2 if "CEST" in times_container.text else 1

        for post_container in posts_table.select("div.PostBody"):
            builder.add_entry(cls._parse_post_table(post_container, offset))

        return builder.build()

    # endregion

    # region Private Methods

    @classmethod
    def _parse_post_table(cls, post_table: bs4.Tag, offset: int = 1) -> ForumPost:
        """Parse the table containing a single post, extracting its information.

        Parameters
        ----------
        post_table: :class:`bs4.Tag`
            The parsed HTML content of the table.
        offset: :class:`int`
            The UTC offset used for the timestamps.

            Since the timestamps found in the post contain no timezone information, the offset is
            extracted from another section and passed here to adjust them accordingly.

        Returns
        -------
        :class:`ForumPost`
            The post contained in the table.
        """
        # The parent may have no "class" attribute at all, so default to an empty list.
        golden_frame = "CipPostWithBorderImage" in post_table.parent.attrs.get("class", [])
        character_info_container = post_table.select_one("div.PostCharacterText")
        post_author = ForumAuthorParser._parse_author_table(character_info_container)
        content_container = post_table.select_one("div.PostText")
        emoticon = None
        title_tag = None
        # The first elements are the emoticon, the title, and the separator.
        # Each one is removed from the container, so the first remaining child is always the next one to check.
        # The default of ``next`` stops the loop if the separator is never found, instead of leaking StopIteration.
        while (child := next(content_container.children, None)) is not None:
            child.extract()
            if child.name == "img":
                emoticon = ForumEmoticon(name=child["alt"], url=child["src"])
            elif child.name == "b":
                title_tag = child
            elif child.name == "div":
                break

        # Remove the first line jump found in post content
        first_break = content_container.select_one("br")
        if first_break:
            first_break.extract()

        title = None
        signature = None
        signature_container = post_table.select_one("td.ff_pagetext")
        if signature_container:
            # Remove the signature's content from content container
            signature_container.extract()
            signature = signature_container.encode_contents().decode()

        content = content_container.encode_contents().decode()
        if signature_container:
            # The signature separator will still be part of the content container, so we remove it
            parts = content.split(signature_separator)
            # This will handle the post containing another signature separator within the content
            # We join back all the pieces except for the last one
            # If the separator is missing, the content is left untouched instead of being wiped out.
            if len(parts) > 1:
                content = signature_separator.join(parts[:-1])

        if title_tag:
            title = title_tag.text

        post_details = post_table.select_one("div.PostDetails")
        dates = post_dates_regex.findall(post_details.text)
        edited_date = None
        edited_by = None
        # The first timestamp is the posting date; a second one, if present, is the last edit.
        posted_date = parse_tibia_forum_datetime(dates[0], offset)
        if len(dates) > 1:
            edited_date = parse_tibia_forum_datetime(dates[1], offset)
            if edited_match := edited_by_regex.search(post_details.text):
                edited_by = edited_match.group(1)

        additional_box = post_table.select_one("div.AdditionalBox")
        post_id = int(additional_box.text.replace("Post #", ""))
        return ForumPost(
            author=post_author,
            content=content,
            signature=signature,
            posted_date=posted_date,
            edited_date=edited_date,
            edited_by=edited_by,
            post_id=post_id,
            title=title,
            emoticon=emoticon,
            golden_frame=golden_frame,
        )
# endregion


class LastPostParser:
    """Parser for the columns that describe the last post of a board or thread."""

    @classmethod
    def _parse_column(cls, last_post_column: bs4.Tag, offset: int = 1) -> Optional[LastPost]:
        """Parse the column containing the last post information and extract its data.

        Parameters
        ----------
        last_post_column: :class:`bs4.Tag`
            The column containing the last post.
        offset: :class:`int`
            Since the timestamps have no offset information, it may be passed to fill it out.

        Returns
        -------
        Optional[:class:`LastPost`]
            The last post described in the column, if any.
        """
        last_post_info = last_post_column.select_one("div.LastPostInfo, span.LastPostInfo")
        if last_post_info is None:
            return None

        permalink_tag = last_post_info.select_one("a")
        permalink_info = parse_link_info(permalink_tag)
        post_id = int(permalink_info["query"]["postid"])
        date_text = clean_text(last_post_info)
        last_post_date = parse_tibia_forum_datetime(date_text, offset)

        last_post_author_tag = last_post_column.select_one("font")
        author_link = last_post_author_tag.select_one("a")
        # An author without a link is considered deleted, unless it is marked as traded below.
        deleted = author_link is None
        # Dropping the first "by" leaves the whitespace that followed it, so the name is always stripped.
        author = clean_text(last_post_author_tag).replace("by", "", 1).strip()
        traded = False
        if "(traded)" in author:
            author = author.replace("(traded)", "").strip()
            traded = True
            deleted = False

        return LastPost(
            author=author,
            post_id=post_id,
            posted_on=last_post_date,
            is_author_deleted=deleted,
            is_author_traded=traded,
        )