Source code for tibiapy.parsers.forum

"""Events related to the forum section."""
from __future__ import annotations

import datetime
import re
import urllib.parse
from typing import TYPE_CHECKING, Optional

from tibiapy import InvalidContentError, errors
from tibiapy.builders import (
    CMPostArchiveBuilder,
    ForumAnnouncementBuilder,
    ForumBoardBuilder,
    ForumThreadBuilder,
)
from tibiapy.enums import ThreadStatus, Vocation
from tibiapy.models import (
    AnnouncementEntry,
    BoardEntry,
    CMPost,
    CMPostArchive,
    ForumAnnouncement,
    ForumAuthor,
    ForumBoard,
    ForumEmoticon,
    ForumPost,
    ForumSection,
    ForumThread,
    GuildMembership,
    LastPost,
    ThreadEntry,
)
from tibiapy.utils import (
    clean_text,
    convert_line_breaks,
    get_rows,
    parse_form_data,
    parse_integer,
    parse_link_info,
    parse_pagination,
    parse_tables_map,
    parse_tibia_datetime,
    parse_tibia_forum_datetime,
    parse_tibiacom_content,
    split_list,
    try_enum,
)

if TYPE_CHECKING:
    import bs4

__all__ = (
    "CMPostArchiveParser",
    "ForumAnnouncementParser",
    "ForumBoardParser",
    "ForumSectionParser",
    "ForumThreadParser",
)

# Captures the timezone abbreviation (CET or CEST) from a "times are ..." notice.
timezone_regex = re.compile(r"times are (CES?T)")
# Captures the file name of a ``.gif`` image. The dot is escaped so only a literal "." is accepted.
filename_regex = re.compile(r"(\w+\.gif)")
# Matches the "(Pages ...)" suffix so it can be removed from thread titles.
pages_regex = re.compile(r"\(Pages[^)]+\)")

# Captures world, vocation and level from the author's information text.
author_info_regex = re.compile(r"Inhabitant of (\w+)\nVocation: ([\w\s]+)\nLevel: (\d+)")
# Captures the author's post count.
author_posts_regex = re.compile(r"Posts: (\d+)")
# Splits "<rank> of the <guild>" into rank and guild name.
guild_regexp = re.compile(r"([\s\w()]+)\sof the\s(.+)")
# Splits "<guild> (<title>)" into guild name and title.
guild_title_regexp = re.compile(r"([^(]+)\s\(([^)]+)\)")
# Matches timestamps in the form "DD.MM.YYYY HH:MM:SS".
post_dates_regex = re.compile(r"(\d{2}\.\d{2}\.\d{4}\s\d{2}:\d{2}:\d{2})")
# Captures the name found between "Edited by" and " on <date>".
edited_by_regex = re.compile(r"Edited by (.*) on \d{2}")

# Text that separates a post's content from its signature.
signature_separator = "________________"

FORUM_POSITIONS = ["Tutor", "Community Manager", "Customer Support", "Programmer", "Game Content Designer", "Tester"]
"""Special positions displayed for characters in the forums."""



[docs]
class CMPostArchiveParser:
    """Parser for the content of the CM Post Archive page in Tibia.com."""


[docs]
    @classmethod
    def from_content(cls, content: str) -> CMPostArchive:
        """Build a CM Post Archive from the HTML of its page in Tibia.com.

        Parameters
        ----------
        content:
            The HTML content of the CM Post Archive in Tibia.com

        Returns
        -------
            The CM Post archive found in the page.

        Raises
        ------
        InvalidContentError
            If content is not the HTML content of the CM Post Archive in Tibia.com

        """
        document = parse_tibiacom_content(content)

        search_form = document.select_one("form")
        try:
            # Exactly six selectors are expected: month, day and year for each end of the date range.
            selectors = search_form.select("select")
            from_month, from_day, from_year, to_month, to_day, to_year = selectors
            start_date = cls._get_selected_date(from_month, from_day, from_year)
            end_date = cls._get_selected_date(to_month, to_day, to_year)
        except (AttributeError, ValueError) as e:
            raise errors.InvalidContentError("content does not belong to the CM Post Archive in Tibia.com", e) from e

        builder = CMPostArchiveBuilder().from_date(start_date).to_date(end_date)
        results_table = document.select_one("table.Table3")
        if not results_table:
            # No results table, only the date range is known.
            return builder.build()

        container = results_table.select_one("div.InnerTableContainer")
        outer_table = container.select_one("table")
        # Keep only the rows whose parent is the outer table itself.
        outer_rows = [row for row in get_rows(outer_table) if row.parent == outer_table]

        # The first row of the content table is the header, the remaining ones are posts.
        _header, *post_rows = get_rows(container.select_one("table.TableContent"))

        for post_row in post_rows:
            cells = post_row.select("td")
            posted_on = parse_tibia_datetime(clean_text(cells[0]))
            board_thread_cell = cells[1]
            # Board and thread title share a cell, separated by a line break.
            convert_line_breaks(board_thread_cell)
            board_name, thread_title = board_thread_cell.text.splitlines()
            post_link = parse_link_info(cells[2].select_one("a"))
            post_id = int(post_link["query"]["postid"])
            builder.add_entry(CMPost(posted_on=posted_on, board=board_name, thread_title=thread_title,
                                     post_id=post_id))

        if post_rows:
            # Pagination is read from the last row of the outer table.
            page, total_pages, results = parse_pagination(outer_rows[-1])
            builder.current_page(page).total_pages(total_pages).results_count(results)

        return builder.build()


    # endregion

    # region Private Methods

    @classmethod
    def _get_selected_date(
            cls,
            month_selector: bs4.Tag,
            day_selector: bs4.Tag,
            year_selector: bs4.Tag,
    ) -> Optional[datetime.date]:
        """Build a date out of the options chosen in three selectors.

        For each selector, the option marked as selected is used, falling back to the first option found.

        Parameters
        ----------
        month_selector: :class:`bs4.Tag`
            The month selector.
        day_selector: :class:`bs4.Tag`
            The day selector.
        year_selector: :class:`bs4.Tag`
            The year selector.

        Returns
        -------
        Optional[:class:`datetime.date`]
            The selected date, or :obj:`None` if the values do not form a valid date.

        """
        def _chosen_option(selector):
            return selector.select_one("option[selected]") or selector.select_one("option")

        month_option = _chosen_option(month_selector)
        day_option = _chosen_option(day_selector)
        year_option = _chosen_option(year_selector)
        try:
            year = int(year_option["value"])
            month = int(month_option["value"])
            day = int(day_option["value"])
            return datetime.date(year, month, day)
        except ValueError:
            # Non numeric values or an impossible date.
            return None

    # endregion



[docs]
class ForumSectionParser:
    """Parser for forum sections, such as world boards, trade boards, etcetera."""


[docs]
    @classmethod
    def from_content(cls, content: str) -> ForumSection:
        """Build a forum section from the HTML of a section's page in Tibia.com.

        Parameters
        ----------
        content:
            The HTML content from Tibia.com

        Returns
        -------
            The forum section found in the page.

        Raises
        ------
        InvalidContentError
            If the page does not contain a "Boards" table.

        """
        document = parse_tibiacom_content(content)
        tables = parse_tables_map(document)
        if "Boards" not in tables:
            raise InvalidContentError("Boards table not found.")

        board_rows = tables["Boards"].select("table.TableContent > tr:not(.LabelH)")
        # The section's id is taken from the URL stored in the "redirect" parameter of the welcome link.
        welcome_link = parse_link_info(document.select_one("p.ForumWelcome > a"))
        redirect_url = urllib.parse.urlparse(welcome_link["query"]["redirect"])
        redirect_params = urllib.parse.parse_qs(redirect_url.query)
        section_id = redirect_params["sectionid"][0]
        current_time = document.select_one("div.CurrentTime")
        if "CEST" in current_time.text:
            offset = 2
        else:
            offset = 1

        boards = []
        for board_row in board_rows:
            board = cls._parse_board_row(board_row, offset)
            # Rows that could not be parsed as a board are left out.
            if board is not None:
                boards.append(board)

        return ForumSection(section_id=int(section_id), entries=boards)


    @classmethod
    def _parse_board_row(cls, board_row: bs4.Tag, offset: int = 1) -> Optional[BoardEntry]:
        """Extract the board described by a row of the boards table.

        Parameters
        ----------
        board_row: :class:`bs4.Tag`
            The row's parsed content.
        offset: :class:`int`
            The offset passed on to the last post parser. Since the displayed dates carry no timezone
            information, it has to be extracted from somewhere else and passed here to adjust them.

        Returns
        -------
        Optional[:class:`BoardEntry`]
            The board contained in this row, or :obj:`None` if the row has fewer than five columns.

        """
        cells = board_row.select("td")
        if len(cells) < 5:
            return None

        # Second column: name and description
        name_cell = cells[1]
        link_tag = name_cell.select_one("a")
        description = name_cell.select_one("font").text
        link = parse_link_info(link_tag)
        board_name = link["text"]
        board_id = int(link["query"]["boardid"])
        # Third column: post count
        post_count = parse_integer(cells[2].text)
        # Fourth column: thread count
        thread_count = parse_integer(cells[3].text)
        # Fifth column: last post information
        last_post = LastPostParser._parse_column(cells[4], offset)
        return BoardEntry(
            name=board_name,
            board_id=board_id,
            description=description,
            posts=post_count,
            threads=thread_count,
            last_post=last_post,
        )




[docs]
class ForumAnnouncementParser:
    """Parser for forum announcements posted by CipSoft."""


[docs]
    @classmethod
    def from_content(cls, content: str, announcement_id: int = 0) -> Optional[ForumAnnouncement]:
        """Parse the content of an announcement's page from Tibia.com.

        Parameters
        ----------
        content:
            The HTML content of an announcement in Tibia.com
        announcement_id:
            The id of the announcement. Since there is no way to obtain the id from the page,
            the id may be passed to assign.

        Returns
        -------
            The announcement contained in the page or :obj:`None` if not found.

        Raises
        ------
        InvalidContentError
            If content is not the HTML content of an announcement page in Tibia.com

        """
        parsed_content = parse_tibiacom_content(content)

        forum_breadcrumbs = parsed_content.select_one("div.ForumBreadCrumbs")
        if not forum_breadcrumbs:
            # Without breadcrumbs, the page is only accepted if it displays an error message.
            message_box = parsed_content.select_one("div.TableContainer")
            if not message_box or "error" not in message_box.text.lower():
                raise errors.InvalidContentError("content is not a Tibia.com forum announcement.")

            return None

        # The first two breadcrumb links are read as the section and the board.
        section_link, board_link, *_ = forum_breadcrumbs.select("a")
        section_link_info = parse_link_info(section_link)
        section = section_link_info["text"]
        section_id = parse_integer(section_link_info["query"]["sectionid"])
        board_link_info = parse_link_info(board_link)
        board = board_link_info["text"]
        board_id = parse_integer(board_link_info["query"]["boardid"])

        builder = (ForumAnnouncementBuilder()
                   .section(section)
                   .section_id(section_id)
                   .board(board)
                   .board_id(board_id)
                   .announcement_id(announcement_id))

        times_container = parsed_content.select_one("div.ForumContentFooterLeft")
        # Look for the abbreviation as a substring, the same way ForumSectionParser does. An equality check only
        # succeeds if the footer text is exactly "CEST", which would never hold for a full sentence.
        offset = 2 if "CEST" in times_container.text else 1

        post_container = parsed_content.select_one("div.ForumPost")
        character_info_container = post_container.select_one("div.PostCharacterText")
        builder.author(ForumAuthorParser._parse_author_table(character_info_container))
        post_text_container = post_container.select_one("div.PostText")
        title_tag = post_text_container.select_one("b")
        builder.title(title_tag.text)
        dates_container = post_text_container.select_one("font")
        dates = post_dates_regex.findall(dates_container.text)
        announcement_content = post_text_container.encode_contents().decode()
        # Everything up to the first horizontal rule is discarded, the rest is the announcement's content.
        _, announcement_content = announcement_content.split("<hr/>", 1)
        builder.content(announcement_content)

        # Exactly two timestamps are expected: the start and the end of the announcement.
        start_date, end_date = (parse_tibia_forum_datetime(date, offset) for date in dates)
        builder.from_date(start_date).to_date(end_date)
        return builder.build()




class ForumAuthorParser:
    """Parser for the author information displayed next to forum posts."""

    @classmethod
    def _parse_author_table(cls, character_info_container: bs4.Tag) -> ForumAuthor:
        """Parse the table containing the author's information.

        Parameters
        ----------
        character_info_container: :class:`bs4.Tag`
            The container with the character's information.

        Returns
        -------
        :class:`ForumAuthor`
            The author's information. Fields whose data is not found in the container are left untouched.

        """
        # First link belongs to character
        char_link = character_info_container.select_one("a")
        if not char_link:
            # Without a link, the author is considered deleted unless it is marked as traded.
            name = character_info_container.text
            deleted = True
            traded = False
            if "(traded)" in name:
                name = name.replace("(traded)", "").strip()
                deleted = False
                traded = True

            return ForumAuthor(name=name, is_author_deleted=deleted, is_author_traded=traded)

        author = ForumAuthor(name=char_link.text)
        char_info = character_info_container.select_one("font.ff_infotext")
        position_info = character_info_container.select_one("font.ff_smallinfo")
        # Position and titles are shown the same way. If we have two, the title is first and then the position.
        # However, if the character only has one of them, there's no way to know which is it unless we validate against
        # possible types
        if position_info and (char_info is None or position_info.parent != char_info):
            convert_line_breaks(position_info)
            titles = [title for title in position_info.text.splitlines() if title]
            for _title in titles:
                if _title in FORUM_POSITIONS:
                    author.position = _title
                else:
                    author.title = _title

        if char_info is None:
            # World, vocation, level, guild and posts are all read from this element; nothing else to parse.
            return author

        # The guild element must be selected before line breaks are converted, same order as before.
        guild_info = char_info.select_one("font.ff_smallinfo")
        convert_line_breaks(char_info)
        char_info_text = char_info.text
        if info_match := author_info_regex.search(char_info_text):
            author.world = info_match.group(1)
            author.vocation = try_enum(Vocation, info_match.group(2))
            author.level = int(info_match.group(3))

        # Only set the guild when the text actually follows the "<rank> of the <guild>" format.
        if guild_info and (guild_match := guild_regexp.search(guild_info.text)):
            guild_name = guild_match.group(2)
            title = None
            if title_match := guild_title_regexp.search(guild_name):
                guild_name = title_match.group(1)
                title = title_match.group(2)

            author.guild = GuildMembership(name=guild_name, rank=guild_match.group(1), title=title)

        if posts_match := author_posts_regex.search(char_info_text):
            author.posts = int(posts_match.group(1))

        return author



[docs]
class ForumBoardParser:
    """A parser for forum boards from Tibia.com."""


[docs]
    @classmethod
    def from_content(cls, content: str) -> Optional[ForumBoard]:
        """Parse the board's HTML content from Tibia.com.

        Parameters
        ----------
        content:
            The HTML content of the board.

        Returns
        -------
            The forum board contained, or :obj:`None` if the page states that the board was not found.

        Raises
        ------
        InvalidContentError
            Content is not a board in Tibia.com

        """
        parsed_content = parse_tibiacom_content(content)
        forum_breadcrumbs = parsed_content.select_one("div.ForumBreadCrumbs")
        if not forum_breadcrumbs:
            # Without breadcrumbs, the page is only accepted if it shows the "board you requested" message.
            message_box = parsed_content.select_one("div.InnerTableContainer")
            if not message_box or "board you requested" not in message_box.text:
                raise errors.InvalidContentError("content does not belong to a board.")

            return None

        tables = parsed_content.select("table.TableContent")

        header_text = forum_breadcrumbs.text.strip()
        section, name = split_list(header_text, "|", "|")
        link_info = parse_link_info(forum_breadcrumbs.select_one("a"))
        section_id = int(link_info["query"]["sectionid"])

        builder = ForumBoardBuilder().name(name).section(section).section_id(section_id)

        forms = parsed_content.select("form")
        post_age_form = forms[0]
        data = parse_form_data(post_age_form)
        if "threadage" in data.values:
            builder.age(parse_integer(data.values["threadage"]))
        else:
            # The first form is not the thread age form, nothing else is parsed.
            return builder.build()

        pagination_block = parsed_content.select_one("small")
        current_page, total_pages, _ = parse_pagination(pagination_block) if pagination_block else (0, 0, 0)
        builder.current_page(current_page)
        builder.total_pages(total_pages)

        # The last row of the threads table is split off because it is not a thread, and the first row is skipped
        # as well (presumably the header).
        *thread_rows, _last_row = get_rows(tables[-1])
        for thread_row in thread_rows[1:]:
            entry = cls._parse_thread_row(thread_row.select("td"))
            if entry is None:
                # The row has no thread link, so there is no thread to add.
                continue

            # Rows without a class attribute are treated as having no classes.
            if "ClassifiedProposal" in thread_row.attrs.get("class", []):
                entry.golden_frame = True

            builder.add_entry(entry)

        if len(tables) > 1:
            # When there is more than one table, the first one lists the announcements.
            announcement_rows = get_rows(tables[0])
            for announcement_row in announcement_rows[1:]:
                author_link, title_link = announcement_row.select("a")
                author = author_link.text.strip()
                announcement_link = parse_link_info(title_link)
                entry = AnnouncementEntry(
                    title=announcement_link["text"],
                    announcement_id=int(announcement_link["query"]["announcementid"]),
                    announcement_author=author,
                )
                builder.add_announcement(entry)

        if len(forms) > 2:
            # The board's id is read from the third form.
            board_selector_form = forms[2]
            data = parse_form_data(board_selector_form)
            builder.board_id(parse_integer(data.values["boardid"]))

        return builder.build()


    # endregion

    # region Private Methods

    @classmethod
    def _parse_thread_row(cls, columns: bs4.ResultSet) -> Optional[ThreadEntry]:
        """Build a thread entry from the columns of a row in a board's thread list.

        Parameters
        ----------
        columns: :class:`bs4.ResultSet`
            The list of columns the thread contains.

        Returns
        -------
        Optional[:class:`ThreadEntry`]
            The thread described by the columns, or :obj:`None` if the title column contains no links.

        """
        # First column: status icon
        status = status_icon = None
        status_image = columns[0].select_one("img")
        if status_image:
            status_icon = status_image["src"]
            icon_filename = filename_regex.search(status_icon).group(1)
            status = ThreadStatus.from_icon(icon_filename)

        # Second column: emoticon, only images with an "alt" text are considered
        emoticon = None
        emoticon_image = columns[1].select_one("img")
        if emoticon_image and emoticon_image.get("alt"):
            emoticon = ForumEmoticon(name=emoticon_image["alt"], url=emoticon_image["src"])

        # Third column: title and links to the thread's pages
        title_column = columns[2]
        title = title_column.text.strip()
        links = title_column.select("a")
        if not links:
            return None

        thread_link, page_links = links[0], links[1:]
        total_pages = 1
        if page_links:
            # The last page link tells how many pages there are; the page list is removed from the title.
            last_page = parse_link_info(page_links[-1])
            total_pages = int(last_page["query"]["pagenumber"])
            title = pages_regex.sub("", title).strip()

        thread_id = int(parse_link_info(thread_link)["query"]["threadid"])
        # Fourth column: thread starter
        starter_column = columns[3]
        thread_starter = starter_column.text.strip()
        # Fifth column: number of replies
        replies = parse_integer(columns[4].text)
        # Sixth column: number of views
        views = parse_integer(columns[5].text)
        # Seventh column: last post information
        last_post = LastPostParser._parse_column(columns[6])

        starter_link = starter_column.select_one("a")
        traded = "(traded)" in thread_starter
        if traded:
            thread_starter = thread_starter.replace("(traded)", "").strip()

        # A starter with no link is considered deleted, unless it is marked as traded.
        starter_deleted = starter_link is None and not traded
        return ThreadEntry(
            title=title,
            thread_id=thread_id,
            thread_starter=thread_starter,
            replies=replies,
            views=views,
            last_post=last_post,
            emoticon=emoticon,
            status=status,
            total_pages=total_pages,
            status_icon=status_icon,
            thread_starter_traded=traded,
            thread_starter_deleted=starter_deleted,
        )


    # endregion



[docs]
class ForumThreadParser:
    """A parser for forum threads from Tibia.com."""


[docs]
    @classmethod
    def from_content(cls, content: str) -> Optional[ForumThread]:
        """Create an instance of the class from the html content of the thread's page.

        Parameters
        ----------
        content:
            The HTML content of the page.

        Returns
        -------
            The thread contained in the page, or None if the thread doesn't exist

        Raises
        ------
        InvalidContentError
            If content is not the HTML of a thread's page.

        """
        parsed_content = parse_tibiacom_content(content)
        forum_breadcrumbs = parsed_content.select_one("div.ForumBreadCrumbs")
        if not forum_breadcrumbs:
            # Without breadcrumbs, the page is only accepted if it shows a "not found" message.
            message_box = parsed_content.select_one("div.InnerTableContainer")
            if not message_box or "not found" not in message_box.text:
                raise errors.InvalidContentError("content does not belong to a thread.")

            return None

        header_text = forum_breadcrumbs.text.strip()
        # Exactly two links are expected in the breadcrumbs: the section and the board.
        section_link, board_link = (parse_link_info(t) for t in forum_breadcrumbs.select("a"))
        section, board, partial_title = split_list(header_text, "|", "|")

        builder = (ForumThreadBuilder()
                   .section(section)
                   .section_id(int(section_link["query"]["sectionid"]))
                   .board_id(int(board_link["query"]["boardid"]))
                   .board(board))
        forum_title_container = parsed_content.select_one("div.ForumTitleText")
        if not forum_title_container:
            # No title container: fall back to the title in the breadcrumbs and skip the posts.
            builder.title(partial_title)
            return builder.build()

        builder.title(forum_title_container.text.strip())

        border = forum_title_container.parent.previous_sibling.previous_sibling
        gold_frame = "gold" in border["style"]
        builder.golden_frame(gold_frame)

        pagination_block = parsed_content.select_one("td.PageNavigation")
        current_page, total_pages, _ = parse_pagination(pagination_block) if pagination_block else (0, 0, 0)
        builder.current_page(current_page)
        builder.total_pages(total_pages)

        posts_table = parsed_content.select_one("table.TableContent")

        thread_info_container = posts_table.select_one("div.ForumPostHeader")
        thread_info_text_container = thread_info_container.select_one("div.ForumPostHeaderText")
        # The header text is expected to have two children: the "#<number>" text and the navigation links.
        thread_number, navigation_container = thread_info_text_container.children
        builder.thread_id(int(thread_number.split("#")[-1]))
        navigation_links = navigation_container.select("a")
        if len(navigation_links) == 2:
            prev_link_tag, next_link_tag = navigation_links
            prev_link = parse_link_info(prev_link_tag)
            builder.previous_topic_number(int(prev_link["query"]["threadid"]))
            next_link = parse_link_info(next_link_tag)
            builder.next_topic_number(int(next_link["query"]["threadid"]))
        elif navigation_links:
            # With a single link, its text tells whether it points to the previous or the next topic.
            if "Previous" in navigation_links[0].text:
                prev_link = parse_link_info(navigation_links[0])
                builder.previous_topic_number(int(prev_link["query"]["threadid"]))
            else:
                next_link = parse_link_info(navigation_links[0])
                builder.next_topic_number(int(next_link["query"]["threadid"]))
        # With no navigation links at all, neither topic number is set.

        times_container = posts_table.select_one("div.ForumContentFooterLeft")
        # Look for the abbreviation as a substring, the same way ForumSectionParser does. An equality check only
        # succeeds if the footer text is exactly "CEST", which would never hold for a full sentence.
        offset = 2 if "CEST" in times_container.text else 1

        post_containers = posts_table.select("div.PostBody")
        for post_container in post_containers:
            post = cls._parse_post_table(post_container, offset)
            builder.add_entry(post)

        return builder.build()


    # endregion

    # region Private Methods

    @classmethod
    def _parse_post_table(cls, post_table: bs4.Tag, offset: int = 1) -> ForumPost:
        """Extract a single post from the element that contains it.

        Parameters
        ----------
        post_table: :class:`bs4.Tag`
            The parsed HTML content of the table.
        offset: :class:`int`
            The UTC offset used for the timestamps.

            Since the timestamps found in the post contain no timezone information, the offset is extracted from
            another section and passed here to adjust them accordingly.

        Returns
        -------
        :class:`ForumPost`
            The post contained in the table.

        """
        golden_frame = "CipPostWithBorderImage" in post_table.parent.attrs.get("class")
        author_container = post_table.select_one("div.PostCharacterText")
        post_author = ForumAuthorParser._parse_author_table(author_container)
        body = post_table.select_one("div.PostText")
        emoticon = None
        title_tag = None
        # The body starts with the emoticon, the title and a separator. The leading nodes are removed one at a
        # time until the separator (a ``div``) has been taken out.
        separator_found = False
        while not separator_found:
            node = next(body.children)
            node.extract()
            if node.name == "img":
                emoticon = ForumEmoticon(name=node["alt"], url=node["src"])
            elif node.name == "b":
                title_tag = node
            else:
                separator_found = node.name == "div"

        # Remove the first line jump found in post content
        leading_break = body.select_one("br")
        if leading_break:
            leading_break.extract()

        signature = None
        signature_container = post_table.select_one("td.ff_pagetext")
        if signature_container:
            # Take the signature out of the tree so it is not rendered as part of the content
            signature_container.extract()
            signature = signature_container.encode_contents().decode()

        content = body.encode_contents().decode()
        if signature_container:
            # The separator is still in the content. Everything after its last occurrence is dropped, so
            # separators typed within the post itself are preserved.
            pieces = content.split(signature_separator)
            content = signature_separator.join(pieces[:-1])

        title = title_tag.text if title_tag else None

        details = post_table.select_one("div.PostDetails")
        dates = post_dates_regex.findall(details.text)
        # The first timestamp is the post's date, a second one is the date it was edited.
        posted_date = parse_tibia_forum_datetime(dates[0], offset)
        edited_date = edited_by = None
        if len(dates) > 1:
            edited_date = parse_tibia_forum_datetime(dates[1], offset)
            edited_by = edited_by_regex.search(details.text).group(1)

        number_box = post_table.select_one("div.AdditionalBox")
        post_id = int(number_box.text.replace("Post #", ""))
        return ForumPost(
            author=post_author,
            content=content,
            signature=signature,
            posted_date=posted_date,
            edited_date=edited_date,
            edited_by=edited_by,
            post_id=post_id,
            title=title,
            emoticon=emoticon,
            golden_frame=golden_frame,
        )


    # endregion


class LastPostParser:
    """Parser for the column that shows the last post of a board or thread."""

    @classmethod
    def _parse_column(cls, last_post_column: bs4.Tag, offset: int = 1) -> Optional[LastPost]:
        """Extract the last post's data from the column that displays it.

        Parameters
        ----------
        last_post_column: :class:`bs4.Tag`:
            The column containing the last post.
        offset: :class:`int`
            Since the timestamps have no offset information, it may be passed to fill it out.

        Returns
        -------
        Optional[:class:`LastPost`]:
            The last post described in the column, or :obj:`None` if the column has no last post information.

        """
        info = last_post_column.select_one("div.LastPostInfo, span.LastPostInfo")
        if info is None:
            return None

        permalink = parse_link_info(info.select_one("a"))
        post_id = int(permalink["query"]["postid"])
        posted_on = parse_tibia_forum_datetime(clean_text(info), offset)

        author_tag = last_post_column.select_one("font")
        has_author_link = author_tag.select_one("a") is not None
        # Only the first "by" is removed, so names containing it are kept intact.
        author = clean_text(author_tag).replace("by", "", 1)
        traded = "(traded)" in author
        if traded:
            author = author.replace("(traded)", "").strip()

        # An author with no link is considered deleted, unless it is marked as traded.
        deleted = not has_author_link and not traded
        return LastPost(
            author=author,
            post_id=post_id,
            posted_on=posted_on,
            is_author_deleted=deleted,
            is_author_traded=traded,
        )
Source code for tibiapy.parsers.forum

Tibia.py

Navigation

Related Topics

Donate/support