Reference

API reference for the functions exported by ParShift.

Parshift

Source code in parshift/oo_parshift.py

class Parshift:
    def __init__(
        self,
        annotation: pd.DataFrame | None = None,
        stats: pd.DataFrame | List[pd.DataFrame] | None = None,
    ):
        """Parshift initialization"""

        self.annotation = annotation
        self.stats = stats

    def process(
        self,
        filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
        N: int = 1,
        **kwargs: Any,
    ):
        """Read a conversation file in CSV format, validate it,
        get Gibson's participation shift codes from turns in a conversation,
        determine the conditional probabilities for a sequence of participation shift codes
        and return a dict with parshift annotations and conditional probabilities.

        The conversation file should have the following columns:

        - `utterance_id`: ID of the message (int)
        - `speaker_id`: ID of the user sending the message (str)
        - `utterance`: The message itself (string)
        - `reply_to_id` or `target_id`: The reply ID or the target ID (int)

        Arguments:
            filepath_or_buffer: Any valid string path to CSV file, as accepted by
                Pandas [`read_csv()`][pandas.read_csv] function.
            N: Number of parts to split the conversation into. Default is 1 (all conversation).
                `N` should be between 1 and 4.
            **kwargs: Keyword parameters passed to Pandas
                [`read_csv()`][pandas.read_csv] function.

        - Parshift.annotation will be data frame equal as returned by [`annotate()`][parshift.annotation.annotate].
        - Parshift.stats will be data frame equal as returned by [`cond_probs()`][parshift.statistics.cond_probs].
        """

        df_annotate = annotate(read_ccsv(filepath_or_buffer, **kwargs))
        self.annotation = df_annotate

        if N == 1:
            self.stats = cond_probs(df_annotate)
        elif N in [2, 3, 4]:
            list_stats = []
            size = len(df_annotate)
            parts = size / N
            for i in range(N):
                # Get all the rows from parts*i to size*(i+1) with all columns
                start = int(parts * i)
                end = int(parts * (i + 1))
                list_stats.append(cond_probs(df_annotate.iloc[start:end, :]))
            self.stats = list_stats
        else:
            raise ValueError("N should be between 1 and 4.")

    def show_plot(self, type: str = "Pshift", filename: str | None = None):
        """Shows the frequency treemap plot returned by [`frequency_treemap()`][parshift.plotting.frequency_treemap]

        Arguments:
            type: Column name to be used to plot the treemap, either `"Pshift"`
                (default) or `"Pshift_class"`.
            filename: Name of the file to save the plot. Default to `None` .

        """

        if self.stats is None:
            raise ValueError(
                "Parshift.stats is None. Please run Parshift.process() first."
            )

        if not isinstance(type, str):
            raise TypeError("Parameter type must be a String")
        if type not in ["Pshift_class", "Pshift"]:
            raise ValueError(
                "Parameter type must be one of the following: `Pshift`, `Pshift_class`"
            )

        if filename != None and not isinstance(filename, str):
            raise TypeError("Parameter filename must be a String")

        if type == "Pshift":
            if isinstance(self.stats, list):
                _, ax = plt.subplots(
                    1, len(self.stats), figsize=(5 * len(self.stats), 5)
                )

                for i in range(len(self.stats)):
                    frequency_treemap(self.stats[i], type=type, ax=ax[i])
                    ax[i].axis("off")
                    ax[i].set_title(f"n {i+1}")
            else:
                ax = frequency_treemap(self.stats, type=type)

            plt.suptitle("Participation-Shift Frequencies")

        elif type == "Pshift_class":
            if isinstance(self.stats, list):
                _, ax = plt.subplots(
                    1, len(self.stats), figsize=(5 * len(self.stats), 5)
                )

                for i in range(len(self.stats)):
                    frequency_treemap(self.stats[i], type=type, ax=ax[i])
                    ax[i].axis("off")
                    ax[i].set_title(f"n {i+1}")
            else:
                ax = frequency_treemap(self.stats, type=type)

            plt.suptitle("Participation Shifts: Class Proportions")

        if filename:
            if ".png" not in filename:
                filename += ".png"
            plt.savefig(filename, dpi=300)

        plt.show()

    def show_stats(self, filename: str | None = None):
        """Prints the stats returned by [`cond_probs()`][parshift.statistics.cond_probs]
        Dataframe. If kwarg N (see [`process`][parshift.Parshift.process]) > 1, prints N data frames.

        Arguments:
            filename: Name of the file (csv) to save the stats data frame. Default to `None`.
        """

        if self.stats is None:
            raise ValueError(
                "Parshift.stats is None. Please run Parshift.process() first."
            )

        if isinstance(self.stats, list):
            for i in range(len(self.stats)):
                print(f"n{i+1}:")
                print(self.stats[i])
                print("-" * 80)

                if filename:
                    if ".csv" not in filename:
                        filename_changed = f"{filename}_n{i+1}.csv"
                    else:
                        filename_changed = filename.replace(".csv", f"_n{i+1}.csv")
                    self.stats[i].to_csv(filename_changed, index=False)

        else:
            print(self.stats)
            if filename:
                if ".csv" not in filename:
                    filename += ".csv"
                self.stats.to_csv(filename, index=False)

    def get_propensities(self, filename: str | None = None) -> pd.DataFrame:
        """Returns a data frame with the Participation Shift propensities.

        Arguments:
            filename: Name of the file (csv) to save the propensities data frame. Default to `None`.

        Returns:
            A Pandas [`DataFrame`][pandas.DataFrame] containing the propensities.
        """

        if self.stats is None:
            raise ValueError(
                "Parshift.stats is None. Please run Parshift.process() first."
            )

        if isinstance(self.stats, list):
            df = propensities(self.stats[0])
            df.index = ["n1"]  # type: ignore
            for i in range(1, len(self.stats)):
                dfx = propensities(self.stats[i])
                dfx.index = [f"n{i+1}"]  # type: ignore
                df = pd.concat([df, dfx])

            if filename:
                if ".csv" not in filename:
                    filename += ".csv"
                df.to_csv(filename, index=False)
            return df

        else:
            df = propensities(self.stats)
            df.index = ["n"]  # type: ignore

            if filename:
                if ".csv" not in filename:
                    filename += ".csv"
                df.to_csv(filename, index=False)
            return df

__init__

__init__(
    annotation: pd.DataFrame | None = None,
    stats: pd.DataFrame | List[pd.DataFrame] | None = None,
)

Parshift initialization

Source code in parshift/oo_parshift.py

def __init__(
    self,
    annotation: pd.DataFrame | None = None,
    stats: pd.DataFrame | List[pd.DataFrame] | None = None,
):
    """Parshift initialization"""

    self.annotation = annotation
    self.stats = stats

get_propensities

get_propensities(filename: str | None = None) -> pd.DataFrame

Returns a data frame with the Participation Shift propensities.

Parameters:

Name	Type	Description	Default
`filename`	`str \| None`	Name of the file (csv) to save the propensities data frame. Default to `None`.	`None`

Returns:

Type	Description
`DataFrame`	A Pandas `DataFrame` containing the propensities.

Source code in parshift/oo_parshift.py

def get_propensities(self, filename: str | None = None) -> pd.DataFrame:
    """Returns a data frame with the Participation Shift propensities.

    Arguments:
        filename: Name of the file (csv) to save the propensities data frame. Default to `None`.

    Returns:
        A Pandas [`DataFrame`][pandas.DataFrame] containing the propensities.
    """

    if self.stats is None:
        raise ValueError(
            "Parshift.stats is None. Please run Parshift.process() first."
        )

    if isinstance(self.stats, list):
        df = propensities(self.stats[0])
        df.index = ["n1"]  # type: ignore
        for i in range(1, len(self.stats)):
            dfx = propensities(self.stats[i])
            dfx.index = [f"n{i+1}"]  # type: ignore
            df = pd.concat([df, dfx])

        if filename:
            if ".csv" not in filename:
                filename += ".csv"
            df.to_csv(filename, index=False)
        return df

    else:
        df = propensities(self.stats)
        df.index = ["n"]  # type: ignore

        if filename:
            if ".csv" not in filename:
                filename += ".csv"
            df.to_csv(filename, index=False)
        return df

process

process(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    N: int = 1,
    **kwargs: Any
)

Read a conversation file in CSV format, validate it, get Gibson's participation shift codes from turns in a conversation, determine the conditional probabilities for a sequence of participation shift codes, and store the resulting annotations and conditional probabilities in the `annotation` and `stats` attributes.

The conversation file should have the following columns:

utterance_id: ID of the message (int)
speaker_id: ID of the user sending the message (str)
utterance: The message itself (string)
reply_to_id or target_id: The reply ID or the target ID (int)

Parameters:

Name	Type	Description	Default
`filepath_or_buffer`	`FilePath \| ReadCsvBuffer[bytes] \| ReadCsvBuffer[str]`	Any valid string path to CSV file, as accepted by Pandas `read_csv()` function.	required
`N`	`int`	Number of parts to split the conversation into. Default is 1 (all conversation). `N` should be between 1 and 4.	`1`
`**kwargs`	`Any`	Keyword parameters passed to Pandas `read_csv()` function.	`{}`

Parshift.annotation will be a data frame like the one returned by annotate().
Parshift.stats will be a data frame like the one returned by cond_probs() (or a list of such data frames when N > 1).

Source code in parshift/oo_parshift.py

def process(
    self,
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    N: int = 1,
    **kwargs: Any,
):
    """Read a conversation file in CSV format, validate it,
    get Gibson's participation shift codes from turns in a conversation,
    determine the conditional probabilities for a sequence of participation shift codes
    and return a dict with parshift annotations and conditional probabilities.

    The conversation file should have the following columns:

    - `utterance_id`: ID of the message (int)
    - `speaker_id`: ID of the user sending the message (str)
    - `utterance`: The message itself (string)
    - `reply_to_id` or `target_id`: The reply ID or the target ID (int)

    Arguments:
        filepath_or_buffer: Any valid string path to CSV file, as accepted by
            Pandas [`read_csv()`][pandas.read_csv] function.
        N: Number of parts to split the conversation into. Default is 1 (all conversation).
            `N` should be between 1 and 4.
        **kwargs: Keyword parameters passed to Pandas
            [`read_csv()`][pandas.read_csv] function.

    - Parshift.annotation will be data frame equal as returned by [`annotate()`][parshift.annotation.annotate].
    - Parshift.stats will be data frame equal as returned by [`cond_probs()`][parshift.statistics.cond_probs].
    """

    df_annotate = annotate(read_ccsv(filepath_or_buffer, **kwargs))
    self.annotation = df_annotate

    if N == 1:
        self.stats = cond_probs(df_annotate)
    elif N in [2, 3, 4]:
        list_stats = []
        size = len(df_annotate)
        parts = size / N
        for i in range(N):
            # Get all the rows from parts*i to size*(i+1) with all columns
            start = int(parts * i)
            end = int(parts * (i + 1))
            list_stats.append(cond_probs(df_annotate.iloc[start:end, :]))
        self.stats = list_stats
    else:
        raise ValueError("N should be between 1 and 4.")

show_plot

show_plot(type: str = 'Pshift', filename: str | None = None)

Shows the frequency treemap plot returned by frequency_treemap()

Parameters:

Name	Type	Description	Default
`type`	`str`	Column name to be used to plot the treemap, either `"Pshift"` (default) or `"Pshift_class"`.	`'Pshift'`
`filename`	`str \| None`	Name of the file to save the plot. Default to `None` .	`None`

Source code in parshift/oo_parshift.py

def show_plot(self, type: str = "Pshift", filename: str | None = None):
    """Shows the frequency treemap plot returned by [`frequency_treemap()`][parshift.plotting.frequency_treemap]

    Arguments:
        type: Column name to be used to plot the treemap, either `"Pshift"`
            (default) or `"Pshift_class"`.
        filename: Name of the file to save the plot. Default to `None` .

    """

    if self.stats is None:
        raise ValueError(
            "Parshift.stats is None. Please run Parshift.process() first."
        )

    if not isinstance(type, str):
        raise TypeError("Parameter type must be a String")
    if type not in ["Pshift_class", "Pshift"]:
        raise ValueError(
            "Parameter type must be one of the following: `Pshift`, `Pshift_class`"
        )

    if filename != None and not isinstance(filename, str):
        raise TypeError("Parameter filename must be a String")

    if type == "Pshift":
        if isinstance(self.stats, list):
            _, ax = plt.subplots(
                1, len(self.stats), figsize=(5 * len(self.stats), 5)
            )

            for i in range(len(self.stats)):
                frequency_treemap(self.stats[i], type=type, ax=ax[i])
                ax[i].axis("off")
                ax[i].set_title(f"n {i+1}")
        else:
            ax = frequency_treemap(self.stats, type=type)

        plt.suptitle("Participation-Shift Frequencies")

    elif type == "Pshift_class":
        if isinstance(self.stats, list):
            _, ax = plt.subplots(
                1, len(self.stats), figsize=(5 * len(self.stats), 5)
            )

            for i in range(len(self.stats)):
                frequency_treemap(self.stats[i], type=type, ax=ax[i])
                ax[i].axis("off")
                ax[i].set_title(f"n {i+1}")
        else:
            ax = frequency_treemap(self.stats, type=type)

        plt.suptitle("Participation Shifts: Class Proportions")

    if filename:
        if ".png" not in filename:
            filename += ".png"
        plt.savefig(filename, dpi=300)

    plt.show()

show_stats

show_stats(filename: str | None = None)

Prints the stats data frame returned by cond_probs(). If kwarg N (see process) > 1, prints N data frames.

Parameters:

Name	Type	Description	Default
`filename`	`str \| None`	Name of the file (csv) to save the stats data frame. Default to `None`.	`None`

Source code in parshift/oo_parshift.py

def show_stats(self, filename: str | None = None):
    """Prints the stats returned by [`cond_probs()`][parshift.statistics.cond_probs]
    Dataframe. If kwarg N (see [`process`][parshift.Parshift.process]) > 1, prints N data frames.

    Arguments:
        filename: Name of the file (csv) to save the stats data frame. Default to `None`.
    """

    if self.stats is None:
        raise ValueError(
            "Parshift.stats is None. Please run Parshift.process() first."
        )

    if isinstance(self.stats, list):
        for i in range(len(self.stats)):
            print(f"n{i+1}:")
            print(self.stats[i])
            print("-" * 80)

            if filename:
                if ".csv" not in filename:
                    filename_changed = f"{filename}_n{i+1}.csv"
                else:
                    filename_changed = filename.replace(".csv", f"_n{i+1}.csv")
                self.stats[i].to_csv(filename_changed, index=False)

    else:
        print(self.stats)
        if filename:
            if ".csv" not in filename:
                filename += ".csv"
            self.stats.to_csv(filename, index=False)

annotate

annotate(conv_df: pd.DataFrame) -> pd.DataFrame

Get Gibson's participation shift codes from turns in a conversation.

Sequences of messages from a speaker to the same addressee are considered to be in the same turn, and therefore will be assigned a single participation shift code.

Parameters:

Name	Type	Description	Default
`conv_df`	`DataFrame`	The conversation from where to obtain the participation shift codes.	required

Returns:

Type	Description
`DataFrame`	A data frame with the participation shift codes for each turn.

Source code in parshift/annotation.py

def annotate(conv_df: pd.DataFrame) -> pd.DataFrame:
    """Get Gibson's participation shift codes from turns in a conversation.

    Sequences of messages from a speaker to the same addressee are considered to
    be in the same turn, and therefore will be assigned a single participation
    shift code.

    The conversation is first grouped into turns with `conv2turns()`. For each
    turn a textual label of the form "<previous turn>, <current turn>" is built
    (e.g. "a to group, b to a") and handed to `_pshift_code()` to obtain the
    code. The first turn has no predecessor, so its code is the empty string.

    Arguments:
        conv_df: The conversation from where to obtain the participation shift codes.
            It is expected to have either a `reply_to_id` or a `target_id` column;
            `reply_to_id` takes precedence when both are present.

    Returns:
        A data frame with the participation shift codes for each turn. Its
            columns are `utterance_ids`, `speaker_id`, `utterance`, either
            `reply_to_id` or `target_id`, and `pshift`.

    Raises:
        TypeError: If `conv_df` is not a Pandas data frame.
    """

    if not isinstance(conv_df, pd.DataFrame):
        raise TypeError("Parameter conv_df must be a Pandas DataFrame")

    conversation = conv2turns(conv_df)

    # part1 will take the parshift label for the previous turn
    part_1 = ""

    # part2 will take the parshift label for the current turn
    part_2 = ""

    # NOTE(review): if conv_df has neither a `reply_to_id` nor a `target_id`
    # column, `annotate_df` is never bound in this function and the final
    # `drop()` cannot succeed -- presumably inputs are validated upstream.
    if "reply_to_id" in conv_df.columns:
        annotate_df = pd.DataFrame(
            {
                "utterance_ids": [],
                "speaker_id": [],
                "utterance": [],
                "reply_to_id": [],
                "label_desc": [],
                "pshift": [],
            }
        )

        # calculate the participation shift for each turn
        for idx, msg in enumerate(conversation):
            # a turn that replies to nothing is addressed to the whole group
            if (
                msg["reply_to_id"] == None
                or msg["reply_to_id"] == "None"
                or msg["reply_to_id"] == ""
            ):
                part_2 = " " + str(msg["speaker_id"]) + " to group"
            else:
                # Look for the turn being replied to among the turns seen so
                # far (current one included). If none contains the replied-to
                # ID, part_1 and part_2 keep the values left by the previous
                # iteration.
                for msgPrev in conversation[: idx + 1]:
                    if msg["reply_to_id"] in msgPrev["utterance_ids"]:
                        # Here part_1 is rebuilt from the replied-to turn,
                        # replacing the value carried over from the previous
                        # iteration.
                        if (
                            msgPrev["reply_to_id"] == None
                            or msgPrev["reply_to_id"] == "None"
                            or msgPrev["reply_to_id"] == ""
                        ):
                            part_1 = str(msgPrev["speaker_id"]) + " to group,"

                        else:  # reply - reply
                            # the replied-to turn was itself a reply: find who
                            # it was addressed to
                            for msgPrev2 in conversation[:idx]:
                                if msgPrev["reply_to_id"] in msgPrev2["utterance_ids"]:
                                    part_1 = (
                                        str(msgPrev["speaker_id"])
                                        + " to "
                                        + str(msgPrev2["speaker_id"])
                                        + ","
                                    )

                        # the current turn is addressed to the speaker of the
                        # replied-to turn
                        part_2 = (
                            " "
                            + str(msg["speaker_id"])
                            + " to "
                            + str(msgPrev["speaker_id"])
                        )

            # p1p2 takes the parshift label for the previous + current turn
            p1p2 = part_1 + part_2

            # part_1 takes the part_2 label for the next iteration
            # (the leading space of part_2 is dropped and a comma appended)
            part_1 = part_2[1:] + ","

            # set value to "" for first turn
            pshift_label = ""

            # we cannot calculate the pshift for the first turn
            if idx != 0:
                pshift_label = _pshift_code(p1p2)

            # append one row per turn at the end of the data frame
            annotate_df.loc[len(annotate_df.index)] = [  # type: ignore
                str(msg["utterance_ids"]),
                str(msg["speaker_id"]),
                msg["utterance"],
                str(msg["reply_to_id"]),
                p1p2,
                pshift_label,
            ]

    elif "target_id" in conv_df.columns:
        annotate_df = pd.DataFrame(
            {
                "utterance_ids": [],
                "speaker_id": [],
                "utterance": [],
                "target_id": [],
                "label_desc": [],
                "pshift": [],
            }
        )

        # calculate the participation shift for each turn
        for idx, msg in enumerate(conversation):
            # if msg has no target, it is directed to the group
            if (
                msg["target_id"] == None
                or msg["target_id"] == "None"
                or msg["target_id"] == ""
            ):
                part_2 = " " + str(msg["speaker_id"]) + " to group"

            # if msg has a target, we save it
            else:
                part_2 = " " + str(msg["speaker_id"]) + " to " + str(msg["target_id"])

            # p1p2 takes the parshift label for the previous + current turn
            p1p2 = part_1 + part_2

            # part_1 takes the part_2 label for the next iteration
            # (the leading space of part_2 is dropped and a comma appended)
            part_1 = part_2[1:] + ","

            # set value to "" for first turn
            pshift_label = ""

            # we cannot calculate the pshift for the first turn
            if idx != 0:
                # the label and code are also stored on the turn dictionary;
                # nothing else in this function reads them back
                msg["label"] = p1p2
                pshift_label = _pshift_code(p1p2)
                msg["pshift"] = pshift_label

            # append one row per turn at the end of the data frame
            annotate_df.loc[len(annotate_df.index)] = [  # type: ignore
                str(msg["utterance_ids"]),
                str(msg["speaker_id"]),
                msg["utterance"],
                str(msg["target_id"]),
                p1p2,
                pshift_label,
            ]

    # the textual label was only needed to derive the code; it is not returned
    annotate_df.drop(columns=["label_desc"], inplace=True)

    return annotate_df

cond_probs

cond_probs(pshift_codes: pd.DataFrame) -> pd.DataFrame

Determine the conditional probabilities for a sequence of participation shift codes.

Parameters:

Name	Type	Description	Default
`pshift_codes`	`DataFrame`	A sequence of participation shift code obtained with `annotate()`.	required

Returns:

Type	Description
`DataFrame`	A data frame containing the frequency, probability and conditional probabilities (two) for each parshift code. This data frame is divided into two 'subgroups': (1) those beginning with an undirected remark (A0-); and, (2) those beginning with a directed one (AB-). The `P(S\|D)` (Probability of a participation shift given a Directed or Undirected remark (D)) column contains the frequency divided by total occurrences in each subgroup, while the `P(S\|D,C)` (Probability of a participation shift given a Directed or Undirected remark (D) and assuming Change of Speaker (C)) column contains the frequency divided by total occurrences in each subgroup, for each participation shift where the change of speaker occurs.

Source code in parshift/statistics.py

def cond_probs(pshift_codes: pd.DataFrame) -> pd.DataFrame:
    """Determine the conditional probabilities for a sequence of participation shift codes.

    Arguments:
        pshift_codes: A sequence of participation shift code obtained with
            [`annotate()`][parshift.annotation.annotate].

    Returns:
        A data frame containing the frequency, probability and conditional probabilities
            (two) for each parshift code. This data frame is divided into two 'subgroups':
            (1) those beginning with an undirected remark (A0-); and, (2) those beginning
            with a directed one (AB-). The `P(S|D)` (Probability of a participation shift
            given a Directed or Undirected remark (D)) column contains the frequency divided
            by total occurrences in each subgroup, while the `P(S|D,C)` (Probability of
            a participation shift given a Directed or Undirected remark (D) and assuming
            Change of Speaker (C)) column contains the frequency divided by total occurrences
            in each subgroup, for each participation shift where the change of speaker occurs.

    Raises:
        TypeError: If `pshift_codes` is not a Pandas data frame.
    """

    if not isinstance(pshift_codes, pd.DataFrame):
        raise TypeError("Parameter pshift_codes must be a Pandas DataFrame")

    # Position 0 holds the per-code frequency table. Positions 1 and 2 are the
    # denominators used for `P(S|D)` in the A0- and AB- subgroups, positions 3
    # and 4 the ones used for `P(S|D,C)` (presumably subgroup totals and
    # change-of-speaker totals -- see `_frequency_table`).
    frequency_table_and_counts = _frequency_table(pshift_codes)
    freq_table = frequency_table_and_counts[0]

    def _ratio(count, total):
        """Rounded `count / total`, or 0 when there is nothing to divide by."""
        return round(count / total, 2) if total != 0 else 0

    # Codes for which `P(S|D,C)` is left blank.
    blank_cpetc = ("A0-AY", "AB-A0", "AB-AY", "A0-A0")

    cond_prob = {}
    for key in freq_table:
        if key.split("-")[0] == "A0":
            total_idx, change_idx = 1, 3
        else:
            total_idx, change_idx = 2, 4

        cond_prob[key] = {
            "CP": _ratio(freq_table[key], frequency_table_and_counts[total_idx]),
            "CPeTC": ""
            if key in blank_cpetc
            else _ratio(freq_table[key], frequency_table_and_counts[change_idx]),
        }

    cond_prob_df = pd.DataFrame.from_dict(cond_prob, orient="index")
    freq = pd.DataFrame.from_dict(freq_table, orient="index", columns=["Frequency"])
    freq["Probability"] = round(freq["Frequency"] / freq["Frequency"].sum(), 2)

    result = (
        pd.concat([freq, cond_prob_df], axis=1)
        .reset_index()
        .rename(columns={"index": "pshift"})
    )

    # Order the rows by the canonical parshift code order.
    result = result.sort_values(
        by=["pshift"], key=lambda x: x.map(_cp_order)
    ).reset_index(drop=True)

    result = result.iloc[:, [0, 1, 2, 3, 4]]

    result["Change of Speaker (C)"] = result["pshift"].apply(_change_of_speaker)

    result["Directed Remark (D)"] = result["pshift"].apply(_targeted_remark)

    result.rename(
        columns={"pshift": "Pshift", "CP": "P(S|D)", "CPeTC": "P(S|D,C)"},
        inplace=True,
    )

    return result

conv2turns

conv2turns(conv_df: pd.DataFrame) -> List[Dict[str, Any]]

Take a conversation data frame and group it into conversation turns.

A turn is a group of messages sent by the same user and addressed to the same target.

Parameters:

Name	Type	Description	Default
`conv_df`	`DataFrame`	The conversation from where to obtain the conversation turns.	required

Returns:

Type	Description
`List[Dict[str, Any]]`	A list of dictionaries, each representing a conversation turn.

Source code in parshift/annotation.py

def conv2turns(conv_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Take a conversation data frame and group it into conversation turns.

    A turn is a group of consecutive messages sent by the same user and
    addressed to the same target.

    Arguments:
        conv_df: The conversation from where to obtain the conversation turns.
            It must have the `utterance_id`, `speaker_id` and `utterance`
            columns plus either `reply_to_id` or `target_id` (`reply_to_id`
            takes precedence when both are present).

    Returns:
        A list of dictionaries, each representing a conversation turn, with
            the keys `utterance_ids` (list of the merged message IDs),
            `speaker_id`, `utterance` (merged texts joined with `". "`) and
            `reply_to_id` or `target_id` (an `int`, or `None` when the message
            has no addressee).

    Raises:
        ValueError: If `conv_df` has neither a `reply_to_id` nor a `target_id`
            column.
    """

    # Bring the index back as a regular column so every field is reachable by name.
    conv_df = conv_df.reset_index()
    if "reply_to_id" in conv_df.columns:
        last_col = "reply_to_id"
    elif "target_id" in conv_df.columns:
        last_col = "target_id"
    else:
        raise ValueError(
            "Parameter conv_df must have a `reply_to_id` or a `target_id` column"
        )

    conversation: List[Dict[str, Any]] = []

    for _, row in conv_df.iterrows():
        # Normalize the addressee: every "missing" marker (None, NaN/NA, empty
        # string, the string "None") becomes None; anything else becomes an int.
        raw = row[last_col]
        if raw is None or pd.isna(raw) or raw in ("", "None"):
            addressee = None
        else:
            addressee = int(float(raw))

        previous = conversation[-1] if conversation else None

        # Same speaker and same addressee as the previous turn: merge the
        # message text and ID into that turn.
        if (
            previous is not None
            and previous["speaker_id"] == row["speaker_id"]
            and previous[last_col] == addressee
        ):
            previous["utterance"] = ". ".join(
                [previous["utterance"], row["utterance"]]
            )
            previous["utterance_ids"].append(row["utterance_id"])

        # Otherwise, create a new dictionary representing a new turn
        else:
            conversation.append(
                {
                    "utterance_ids": [row["utterance_id"]],
                    "speaker_id": row["speaker_id"],
                    "utterance": row["utterance"],
                    last_col: addressee,
                }
            )

    return conversation

frequency_treemap

frequency_treemap(
    cond_probs_df: pd.DataFrame,
    ax: Optional[matplotlib.axes.Axes] = None,
    type: str = "Pshift",
) -> matplotlib.axes.Axes

Get a matplotlib axes object displaying the conditional probabilities or frequencies.

Parameters:

Name	Type	Description	Default
`cond_probs_df`	`DataFrame`	Dataframe with information about the participation shift conditional probabilities. This data frame can be obtained with `cond_probs()`	required
`type`	`str`	Column name to be used to plot the treemap, either `"Pshift"` (default) or `"Pshift_class"`.	`'Pshift'`
`ax`	`Optional[Axes]`	Matplotlib axes with the treemap plot.	`None`

Returns:

Name	Type	Description
`ax`	`Axes`	Matplotlib axes with the participation shifts probabilities or frequency.

Source code in parshift/plotting.py

def frequency_treemap(
    cond_probs_df: pd.DataFrame,
    ax: Optional[matplotlib.axes.Axes] = None,
    type: str = "Pshift",
) -> matplotlib.axes.Axes:
    """Get a matplotlib axes object displaying the conditional probabilities or frequencies.

    The treemap has one rectangle per participation shift (or participation
    shift class) with a non-zero frequency. Each rectangle is labelled with its
    name and its share of the total frequency, as a percentage rounded to one
    decimal place. The input data frame is not modified.

    Arguments:
        cond_probs_df: Dataframe with information about the participation shift
            conditional probabilities. This data frame can be obtained with
            [`cond_probs()`][parshift.statistics.cond_probs]
        type: Column name to be used to plot the treemap, either `"Pshift"`
            (default) or `"Pshift_class"`.
        ax: Matplotlib axes on which to draw the treemap. If `None`, a new
            figure and axes are created.

    Returns:
        ax: Matplotlib axes with the participation shifts probabilities or frequency.

    Raises:
        TypeError: If `type` is not a string.
        ValueError: If `type` is not `"Pshift"` or `"Pshift_class"`, or if
            every frequency is zero (there is nothing to plot).
    """

    if not isinstance(type, str):
        raise TypeError("Parameter type must be a String")
    if type not in ["Pshift_class", "Pshift"]:
        raise ValueError(
            "Parameter type must be one of the following: `Pshift`, `Pshift_class`"
        )

    # `assign()` returns a new data frame, so the class column is derived
    # without adding it to the caller's data frame
    if type == "Pshift_class":
        cond_probs_df = cond_probs_df.assign(
            Pshift_class=cond_probs_df["Pshift"].apply(pshift_class)
        )

    gb_parshift = cond_probs_df.groupby([type])["Frequency"].sum()

    # Keep only the groups that actually occur, as (frequency, name) pairs
    data = [
        (freq, name)
        for freq, name in zip(gb_parshift.values, gb_parshift.index.values)
        if freq != 0
    ]
    if not data:
        raise ValueError("Nothing to plot: all frequencies are zero")

    sizes, names = zip(*data)
    total = sum(sizes)

    labels = [f"{name} \n {round(100 * (size / total), 1)}%" for size, name in data]

    # One color family per participation shift class; individual codes get a
    # shade of their class color
    color_dict = {
        "Turn Receiving": "#86d87c",
        "AB-BA": "#86d87c",
        "AB-B0": "#c6ecbe",
        "AB-BY": "#7cd892",
        "Turn Claiming": "#f4b461",
        "A0-X0": "#f4b461",
        "A0-XA": "#fb9948",
        "A0-XY": "#efa107",
        "Turn Usurping": "#ff4d4d",
        "AB-X0": "#ff4d4d",
        "AB-XA": "#fb7477",
        "AB-XB": "#ef3b6e",
        "AB-XY": "#ef483b",
        "Turn Continuing": "#85eff9",
        "A0-AY": "#3b61ef",
        "AB-A0": "#85eff9",
        "AB-AY": "#b9befb",
    }

    colors = [color_dict[name] for name in names]

    if ax is None:
        _, ax = plt.subplots()

    squarify.plot(
        sizes,
        label=labels,
        pad=2,
        color=colors,
        ax=ax,
    )

    # Hide the axis on the axes we drew on, which is not necessarily the
    # "current" pyplot axes when the caller supplied `ax`
    ax.axis("off")
    return ax

propensities

propensities(cond_probs_df: pd.DataFrame) -> pd.DataFrame

Determine the propensities from a conditional probabilities data frame.

Parameters:

Name	Type	Description	Default
`cond_probs_df`	`DataFrame`	A data frame with statistics obtained with `cond_probs()`.	required

Returns:

Type	Description
`DataFrame`	A data frame containing the propensities proposed by Gibson.

Source code in parshift/statistics.py

def propensities(cond_probs_df: pd.DataFrame) -> pd.DataFrame:
    """Determine the propensities from a conditional probabilities data frame.

    Each propensity is the sum of a conditional probability over three
    participation shift codes. Rows are looked up by their participation shift
    code, so the result does not depend on the row order of `cond_probs_df`.

    Arguments:
        cond_probs_df: A data frame with statistics obtained with
            [`cond_probs()`][parshift.statistics.cond_probs]. It must provide
            the `P(S|D)` and `P(S|D,C)` columns and identify each row by its
            participation shift code, either in a `Pshift` column or, when
            that column is absent, in the index.

    Returns:
        A data frame containing the propensities proposed by Gibson, with a
            single row and the columns `turn-receiving`, `targeting` and
            `termination`.

    Raises:
        KeyError: If a required column or participation shift code is missing.
    """

    # Address rows by participation shift code rather than by position
    if "Pshift" in cond_probs_df.columns:
        by_code = cond_probs_df.set_index("Pshift")
    else:
        by_code = cond_probs_df

    p_s_d = by_code["P(S|D)"]
    p_s_d_c = by_code["P(S|D,C)"]

    # Propensity name -> (probability column, codes whose values are summed)
    definitions = {
        # turn-receiving propensity -> AB-BA, AB-B0 and AB-BY ( P(S|D) )
        "turn-receiving": (p_s_d, ("AB-BA", "AB-B0", "AB-BY")),
        # targeting propensity -> A0-XY, AB-BY and AB-XY ( P(S|D,C) )
        "targeting": (p_s_d_c, ("A0-XY", "AB-BY", "AB-XY")),
        # termination propensity -> A0-AY, AB-A0 and AB-AY ( P(S|D) )
        "termination": (p_s_d, ("A0-AY", "AB-A0", "AB-AY")),
    }

    dic_propensities = {
        name: sum(probs.loc[code] for code in codes)
        for name, (probs, codes) in definitions.items()
    }

    return pd.DataFrame([dic_propensities])

pshift_class

pshift_class(pshift: str) -> str

Returns the participation shift class given a participation shift code.

Parameters:

Name	Type	Description	Default
`pshift`	`str`	Participation shift code (e.g A0-XA).	required

Returns:

Type	Description
`str`	Participation shift class for the given participation shift code (either "Turn Receiving", "Turn Claiming", "Turn Usurping" or "Turn Continuing").

Source code in parshift/annotation.py

def pshift_class(pshift: str) -> str:
    """Returns the participation shift class given a participation shift code.

    Arguments:
        pshift: Participation shift code (e.g. A0-XA).

    Returns:
        Participation shift class for the given participation shift code (either
            "Turn Receiving", "Turn Claiming", "Turn Usurping" or "Turn Continuing").

    Raises:
        TypeError: If `pshift` is not a string.
        ValueError: If `pshift` is not a participation shift code.
    """

    if not isinstance(pshift, str):
        raise TypeError("Parameter pshift_code must be a String")

    # The whole string must be a code: `fullmatch` rejects surrounding text,
    # and the character classes list only the allowed letters (a `|` inside
    # `[...]` would be matched literally)
    if not re.fullmatch(r"A[B0]-[ABX][ABXY0]", pshift):
        raise ValueError("Parameter pshift_code must be a parshift code. eg: AB-B0")

    # A well-formed string may still not be a known code (no entry in the
    # mapping); report that as an invalid value rather than a KeyError
    try:
        return _p_shift_dict[pshift]
    except KeyError:
        raise ValueError(
            "Parameter pshift_code must be a parshift code. eg: AB-B0"
        ) from None

read_ccsv

read_ccsv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    **kwargs: Any
) -> pd.DataFrame

Read a conversation file in CSV format, validate it and return a data frame.

The conversation file should have the following columns:

utterance_id: ID of the message (int)
speaker_id: ID of the user sending the message (str)
utterance: The message itself (string)
reply_to_id or target_id: The reply ID or the target ID (int)

Parameters:

Name	Type	Description	Default
`filepath_or_buffer`	`FilePath \| ReadCsvBuffer[bytes] \| ReadCsvBuffer[str]`	Any valid string path to CSV file, as accepted by Pandas `read_csv()` function.	required
`**kwargs`	`Any`	Keyword parameters passed to Pandas `read_csv()` function.	`{}`

Returns:

Type	Description
`DataFrame`	A Pandas `DataFrame` containing the validated conversation.

Source code in parshift/annotation.py

def read_ccsv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    **kwargs: Any,
) -> pd.DataFrame:
    """Load a CSV conversation file, check its columns and return it as a data frame.

    The conversation file should have the following columns:

    - `utterance_id`: ID of the message (int)
    - `speaker_id`: ID of the user sending the message (str)
    - `utterance`: The message itself (string)
    - `reply_to_id` or `target_id`: The reply ID or the target ID (int)

    Arguments:
        filepath_or_buffer: Any valid string path to CSV file, as accepted by
            Pandas [`read_csv()`][pandas.read_csv] function.
        **kwargs: Keyword parameters passed to Pandas
            [`read_csv()`][pandas.read_csv] function.

    Returns:
        A Pandas [`DataFrame`][pandas.DataFrame] containing the validated
            conversation.

    Raises:
        ValueError: If a required column is missing from the CSV file.
    """

    # Load the file, using the expected columns as the dtype specification
    conversation: pd.DataFrame = pd.read_csv(filepath_or_buffer, dtype=_p_shift_cols, **kwargs)  # type: ignore

    # Expected columns that are not present in the file
    absent = _p_shift_cols.keys() - conversation.columns

    # Two or more absent columns is always an error
    if len(absent) > 1:
        raise ValueError(f"CSV file is missing the `{'`, `'.join(absent)}` columns")

    # A single absent column is acceptable only when it is `reply_to_id` or
    # `target_id`
    if len(absent) == 1 and absent.isdisjoint({"reply_to_id", "target_id"}):
        raise ValueError(f"CSV file is missing the `{absent.pop()}` column")

    # Replace NaN with empty strings in whichever of the two columns is used,
    # preferring `reply_to_id` when it exists
    link_col = "reply_to_id" if "reply_to_id" in conversation.columns else "target_id"
    conversation[link_col] = conversation[link_col].fillna("")

    return conversation