Skip to content

Reference

API reference for the functions exported by ParShift.

Parshift

Source code in parshift/oo_parshift.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
class Parshift:
    def __init__(
        self,
        annotation: pd.DataFrame | None = None,
        stats: pd.DataFrame | List[pd.DataFrame] | None = None,
    ):
        """Parshift initialization"""

        self.annotation = annotation
        self.stats = stats

    def process(
        self,
        filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
        N: int = 1,
        **kwargs: Any,
    ):
        """Read a conversation file in CSV format, validate it,
        get Gibson's participation shift codes from turns in a conversation,
        determine the conditional probabilities for a sequence of participation shift codes
        and return a dict with parshift annotations and conditional probabilities.

        The conversation file should have the following columns:

        - `utterance_id`: ID of the message (int)
        - `speaker_id`: ID of the user sending the message (str)
        - `utterance`: The message itself (string)
        - `reply_to_id` or `target_id`: The reply ID or the target ID (int)

        Arguments:
            filepath_or_buffer: Any valid string path to CSV file, as accepted by
                Pandas [`read_csv()`][pandas.read_csv] function.
            N: Number of parts to split the conversation into. Default is 1 (all conversation).
                `N` should be between 1 and 4.
            **kwargs: Keyword parameters passed to Pandas
                [`read_csv()`][pandas.read_csv] function.

        - Parshift.annotation will be data frame equal as returned by [`annotate()`][parshift.annotation.annotate].
        - Parshift.stats will be data frame equal as returned by [`cond_probs()`][parshift.statistics.cond_probs].
        """

        df_annotate = annotate(read_ccsv(filepath_or_buffer, **kwargs))
        self.annotation = df_annotate

        if N == 1:
            self.stats = cond_probs(df_annotate)
        elif N in [2, 3, 4]:
            list_stats = []
            size = len(df_annotate)
            parts = size / N
            for i in range(N):
                # Get all the rows from parts*i to size*(i+1) with all columns
                start = int(parts * i)
                end = int(parts * (i + 1))
                list_stats.append(cond_probs(df_annotate.iloc[start:end, :]))
            self.stats = list_stats
        else:
            raise ValueError("N should be between 1 and 4.")

    def show_plot(self, type: str = "Pshift", filename: str | None = None):
        """Shows the frequency treemap plot returned by [`frequency_treemap()`][parshift.plotting.frequency_treemap]

        Arguments:
            type: Column name to be used to plot the treemap, either `"Pshift"`
                (default) or `"Pshift_class"`.
            filename: Name of the file to save the plot. Default to `None` .

        """

        if self.stats is None:
            raise ValueError(
                "Parshift.stats is None. Please run Parshift.process() first."
            )

        if not isinstance(type, str):
            raise TypeError("Parameter type must be a String")
        if type not in ["Pshift_class", "Pshift"]:
            raise ValueError(
                "Parameter type must be one of the following: `Pshift`, `Pshift_class`"
            )

        if filename != None and not isinstance(filename, str):
            raise TypeError("Parameter filename must be a String")

        if type == "Pshift":
            if isinstance(self.stats, list):
                _, ax = plt.subplots(
                    1, len(self.stats), figsize=(5 * len(self.stats), 5)
                )

                for i in range(len(self.stats)):
                    frequency_treemap(self.stats[i], type=type, ax=ax[i])
                    ax[i].axis("off")
                    ax[i].set_title(f"n {i+1}")
            else:
                ax = frequency_treemap(self.stats, type=type)

            plt.suptitle("Participation-Shift Frequencies")

        elif type == "Pshift_class":
            if isinstance(self.stats, list):
                _, ax = plt.subplots(
                    1, len(self.stats), figsize=(5 * len(self.stats), 5)
                )

                for i in range(len(self.stats)):
                    frequency_treemap(self.stats[i], type=type, ax=ax[i])
                    ax[i].axis("off")
                    ax[i].set_title(f"n {i+1}")
            else:
                ax = frequency_treemap(self.stats, type=type)

            plt.suptitle("Participation Shifts: Class Proportions")

        if filename:
            if ".png" not in filename:
                filename += ".png"
            plt.savefig(filename, dpi=300)

        plt.show()

    def show_stats(self, filename: str | None = None):
        """Prints the stats returned by [`cond_probs()`][parshift.statistics.cond_probs]
        Dataframe. If kwarg N (see [`process`][parshift.Parshift.process]) > 1, prints N data frames.

        Arguments:
            filename: Name of the file (csv) to save the stats data frame. Default to `None`.
        """

        if self.stats is None:
            raise ValueError(
                "Parshift.stats is None. Please run Parshift.process() first."
            )

        if isinstance(self.stats, list):
            for i in range(len(self.stats)):
                print(f"n{i+1}:")
                print(self.stats[i])
                print("-" * 80)

                if filename:
                    if ".csv" not in filename:
                        filename_changed = f"{filename}_n{i+1}.csv"
                    else:
                        filename_changed = filename.replace(".csv", f"_n{i+1}.csv")
                    self.stats[i].to_csv(filename_changed, index=False)

        else:
            print(self.stats)
            if filename:
                if ".csv" not in filename:
                    filename += ".csv"
                self.stats.to_csv(filename, index=False)

    def get_propensities(self, filename: str | None = None) -> pd.DataFrame:
        """Returns a data frame with the Participation Shift propensities.

        Arguments:
            filename: Name of the file (csv) to save the propensities data frame. Default to `None`.

        Returns:
            A Pandas [`DataFrame`][pandas.DataFrame] containing the propensities.
        """

        if self.stats is None:
            raise ValueError(
                "Parshift.stats is None. Please run Parshift.process() first."
            )

        if isinstance(self.stats, list):
            df = propensities(self.stats[0])
            df.index = ["n1"]  # type: ignore
            for i in range(1, len(self.stats)):
                dfx = propensities(self.stats[i])
                dfx.index = [f"n{i+1}"]  # type: ignore
                df = pd.concat([df, dfx])

            if filename:
                if ".csv" not in filename:
                    filename += ".csv"
                df.to_csv(filename, index=False)
            return df

        else:
            df = propensities(self.stats)
            df.index = ["n"]  # type: ignore

            if filename:
                if ".csv" not in filename:
                    filename += ".csv"
                df.to_csv(filename, index=False)
            return df

__init__

__init__(
    annotation: pd.DataFrame | None = None,
    stats: pd.DataFrame | List[pd.DataFrame] | None = None,
)

Parshift initialization

Source code in parshift/oo_parshift.py
19
20
21
22
23
24
25
26
27
def __init__(
    self,
    annotation: pd.DataFrame | None = None,
    stats: pd.DataFrame | List[pd.DataFrame] | None = None,
):
    """Parshift initialization"""

    self.annotation = annotation
    self.stats = stats

get_propensities

get_propensities(filename: str | None = None) -> pd.DataFrame

Returns a data frame with the Participation Shift propensities.

Parameters:

Name Type Description Default
filename str | None

Name of the file (csv) to save the propensities data frame. Default to None.

None

Returns:

Type Description
DataFrame

A Pandas DataFrame containing the propensities.

Source code in parshift/oo_parshift.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
def get_propensities(self, filename: str | None = None) -> pd.DataFrame:
    """Returns a data frame with the Participation Shift propensities.

    Arguments:
        filename: Name of the file (csv) to save the propensities data frame. Default to `None`.

    Returns:
        A Pandas [`DataFrame`][pandas.DataFrame] containing the propensities.
    """

    if self.stats is None:
        raise ValueError(
            "Parshift.stats is None. Please run Parshift.process() first."
        )

    if isinstance(self.stats, list):
        df = propensities(self.stats[0])
        df.index = ["n1"]  # type: ignore
        for i in range(1, len(self.stats)):
            dfx = propensities(self.stats[i])
            dfx.index = [f"n{i+1}"]  # type: ignore
            df = pd.concat([df, dfx])

        if filename:
            if ".csv" not in filename:
                filename += ".csv"
            df.to_csv(filename, index=False)
        return df

    else:
        df = propensities(self.stats)
        df.index = ["n"]  # type: ignore

        if filename:
            if ".csv" not in filename:
                filename += ".csv"
            df.to_csv(filename, index=False)
        return df

process

process(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    N: int = 1,
    **kwargs: Any
)

Read a conversation file in CSV format, validate it, get Gibson's participation shift codes from turns in a conversation, and determine the conditional probabilities for a sequence of participation shift codes. Nothing is returned: the annotations and the conditional probabilities are stored in the Parshift.annotation and Parshift.stats attributes, respectively.

The conversation file should have the following columns:

  • utterance_id: ID of the message (int)
  • speaker_id: ID of the user sending the message (str)
  • utterance: The message itself (string)
  • reply_to_id or target_id: The reply ID or the target ID (int)

Parameters:

Name Type Description Default
filepath_or_buffer FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]

Any valid string path to CSV file, as accepted by Pandas read_csv() function.

required
N int

Number of parts to split the conversation into. Default is 1 (all conversation). N should be between 1 and 4.

1
**kwargs Any

Keyword parameters passed to Pandas read_csv() function.

{}
  • Parshift.annotation will be a data frame equal to the one returned by annotate().
  • Parshift.stats will be a data frame equal to the one returned by cond_probs(), or a list of N such data frames when N is greater than 1.
Source code in parshift/oo_parshift.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def process(
    self,
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    N: int = 1,
    **kwargs: Any,
):
    """Read a conversation file in CSV format, validate it,
    get Gibson's participation shift codes from turns in a conversation,
    determine the conditional probabilities for a sequence of participation shift codes
    and return a dict with parshift annotations and conditional probabilities.

    The conversation file should have the following columns:

    - `utterance_id`: ID of the message (int)
    - `speaker_id`: ID of the user sending the message (str)
    - `utterance`: The message itself (string)
    - `reply_to_id` or `target_id`: The reply ID or the target ID (int)

    Arguments:
        filepath_or_buffer: Any valid string path to CSV file, as accepted by
            Pandas [`read_csv()`][pandas.read_csv] function.
        N: Number of parts to split the conversation into. Default is 1 (all conversation).
            `N` should be between 1 and 4.
        **kwargs: Keyword parameters passed to Pandas
            [`read_csv()`][pandas.read_csv] function.

    - Parshift.annotation will be data frame equal as returned by [`annotate()`][parshift.annotation.annotate].
    - Parshift.stats will be data frame equal as returned by [`cond_probs()`][parshift.statistics.cond_probs].
    """

    df_annotate = annotate(read_ccsv(filepath_or_buffer, **kwargs))
    self.annotation = df_annotate

    if N == 1:
        self.stats = cond_probs(df_annotate)
    elif N in [2, 3, 4]:
        list_stats = []
        size = len(df_annotate)
        parts = size / N
        for i in range(N):
            # Get all the rows from parts*i to size*(i+1) with all columns
            start = int(parts * i)
            end = int(parts * (i + 1))
            list_stats.append(cond_probs(df_annotate.iloc[start:end, :]))
        self.stats = list_stats
    else:
        raise ValueError("N should be between 1 and 4.")

show_plot

show_plot(type: str = 'Pshift', filename: str | None = None)

Shows the frequency treemap plot returned by frequency_treemap()

Parameters:

Name Type Description Default
type str

Column name to be used to plot the treemap, either "Pshift" (default) or "Pshift_class".

'Pshift'
filename str | None

Name of the file in which to save the plot. Defaults to None.

None
Source code in parshift/oo_parshift.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def show_plot(self, type: str = "Pshift", filename: str | None = None):
    """Shows the frequency treemap plot returned by [`frequency_treemap()`][parshift.plotting.frequency_treemap]

    Arguments:
        type: Column name to be used to plot the treemap, either `"Pshift"`
            (default) or `"Pshift_class"`.
        filename: Name of the file to save the plot. Default to `None` .

    """

    if self.stats is None:
        raise ValueError(
            "Parshift.stats is None. Please run Parshift.process() first."
        )

    if not isinstance(type, str):
        raise TypeError("Parameter type must be a String")
    if type not in ["Pshift_class", "Pshift"]:
        raise ValueError(
            "Parameter type must be one of the following: `Pshift`, `Pshift_class`"
        )

    if filename != None and not isinstance(filename, str):
        raise TypeError("Parameter filename must be a String")

    if type == "Pshift":
        if isinstance(self.stats, list):
            _, ax = plt.subplots(
                1, len(self.stats), figsize=(5 * len(self.stats), 5)
            )

            for i in range(len(self.stats)):
                frequency_treemap(self.stats[i], type=type, ax=ax[i])
                ax[i].axis("off")
                ax[i].set_title(f"n {i+1}")
        else:
            ax = frequency_treemap(self.stats, type=type)

        plt.suptitle("Participation-Shift Frequencies")

    elif type == "Pshift_class":
        if isinstance(self.stats, list):
            _, ax = plt.subplots(
                1, len(self.stats), figsize=(5 * len(self.stats), 5)
            )

            for i in range(len(self.stats)):
                frequency_treemap(self.stats[i], type=type, ax=ax[i])
                ax[i].axis("off")
                ax[i].set_title(f"n {i+1}")
        else:
            ax = frequency_treemap(self.stats, type=type)

        plt.suptitle("Participation Shifts: Class Proportions")

    if filename:
        if ".png" not in filename:
            filename += ".png"
        plt.savefig(filename, dpi=300)

    plt.show()

show_stats

show_stats(filename: str | None = None)

Prints the stats data frame returned by cond_probs(). If the N argument of process() was greater than 1, prints N data frames.

Parameters:

Name Type Description Default
filename str | None

Name of the file (csv) to save the stats data frame. Default to None.

None
Source code in parshift/oo_parshift.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
def show_stats(self, filename: str | None = None):
    """Prints the stats returned by [`cond_probs()`][parshift.statistics.cond_probs]
    Dataframe. If kwarg N (see [`process`][parshift.Parshift.process]) > 1, prints N data frames.

    Arguments:
        filename: Name of the file (csv) to save the stats data frame. Default to `None`.
    """

    if self.stats is None:
        raise ValueError(
            "Parshift.stats is None. Please run Parshift.process() first."
        )

    if isinstance(self.stats, list):
        for i in range(len(self.stats)):
            print(f"n{i+1}:")
            print(self.stats[i])
            print("-" * 80)

            if filename:
                if ".csv" not in filename:
                    filename_changed = f"{filename}_n{i+1}.csv"
                else:
                    filename_changed = filename.replace(".csv", f"_n{i+1}.csv")
                self.stats[i].to_csv(filename_changed, index=False)

    else:
        print(self.stats)
        if filename:
            if ".csv" not in filename:
                filename += ".csv"
            self.stats.to_csv(filename, index=False)

annotate

annotate(conv_df: pd.DataFrame) -> pd.DataFrame

Get Gibson's participation shift codes from turns in a conversation.

Sequences of messages from a speaker to the same addressee are considered to be in the same turn, and therefore will be assigned a single participation shift code.

Parameters:

Name Type Description Default
conv_df DataFrame

The conversation from where to obtain the participation shift codes.

required

Returns:

Type Description
DataFrame

A data frame with the participation shift codes for each turn.

Source code in parshift/annotation.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
def annotate(conv_df: pd.DataFrame) -> pd.DataFrame:
    """Get Gibson's participation shift codes from turns in a conversation.

    Sequences of messages from a speaker to the same addressee are considered to
    be in the same turn, and therefore will be assigned a single participation
    shift code.

    The conversation is first grouped into turns with `conv2turns()`. For every
    turn a textual label of the form `"<previous turn>, <current turn>"` is
    built (e.g. `"a to group, b to a"`) and converted into a participation
    shift code with `_pshift_code()`. The first turn has no predecessor, so its
    code is left as an empty string.

    Arguments:
        conv_df: The conversation from where to obtain the participation shift codes.
            Addressee information is read from its `reply_to_id` column or, when
            that column is absent, from its `target_id` column.

    Returns:
        A data frame with the participation shift codes for each turn. Its
            columns are `utterance_ids`, `speaker_id`, `utterance`, `reply_to_id`
            or `target_id`, and `pshift`.

    Raises:
        TypeError: If `conv_df` is not a Pandas DataFrame.
    """

    if not isinstance(conv_df, pd.DataFrame):
        raise TypeError("Parameter conv_df must be a Pandas DataFrame")

    # Group consecutive messages into turns (a list of dicts, one per turn)
    conversation = conv2turns(conv_df)

    # part1 will take the parshift label for the previous turn
    part_1 = ""

    # part2 will take the parshift label for the current turn
    part_2 = ""

    # NOTE(review): if neither "reply_to_id" nor "target_id" is a column,
    # `annotate_df` is never bound and the final `drop()` below would fail.
    if "reply_to_id" in conv_df.columns:
        # Empty frame that is filled one row per turn; "label_desc" holds the
        # textual label and is removed again before returning.
        annotate_df = pd.DataFrame(
            {
                "utterance_ids": [],
                "speaker_id": [],
                "utterance": [],
                "reply_to_id": [],
                "label_desc": [],
                "pshift": [],
            }
        )

        # calculate the participation shift for each turn
        for idx, msg in enumerate(conversation):
            # a turn that replies to nothing is addressed to the group
            if (
                msg["reply_to_id"] == None
                or msg["reply_to_id"] == "None"
                or msg["reply_to_id"] == ""
            ):
                part_2 = " " + str(msg["speaker_id"]) + " to group"
            else:
                # Look for the turn that contains the utterance being replied
                # to; the current turn itself is included in the search.
                # NOTE(review): when no turn matches, part_1 and part_2 keep
                # the values left over from the previous iteration.
                for msgPrev in conversation[: idx + 1]:
                    if msg["reply_to_id"] in msgPrev["utterance_ids"]:
                        # the replied-to turn was itself addressed to the group
                        if (
                            msgPrev["reply_to_id"] == None
                            or msgPrev["reply_to_id"] == "None"
                            or msgPrev["reply_to_id"] == ""
                        ):
                            part_1 = str(msgPrev["speaker_id"]) + " to group,"

                        else:  # reply - reply
                            # the replied-to turn was a reply too: find the
                            # earlier turn it answered to name its addressee
                            for msgPrev2 in conversation[:idx]:
                                if msgPrev["reply_to_id"] in msgPrev2["utterance_ids"]:
                                    part_1 = (
                                        str(msgPrev["speaker_id"])
                                        + " to "
                                        + str(msgPrev2["speaker_id"])
                                        + ","
                                    )

                        # current speaker addresses the speaker replied to
                        part_2 = (
                            " "
                            + str(msg["speaker_id"])
                            + " to "
                            + str(msgPrev["speaker_id"])
                        )

            # p1p2 takes the parshift label for the previous + current turn
            p1p2 = part_1 + part_2

            # part_1 takes the part_2 label for the next iteration
            # (the leading space of part_2 is dropped and a comma appended)
            part_1 = part_2[1:] + ","

            # set value to "" for first turn
            pshift_label = ""

            # we cannot calculate the pshift for the first turn
            if idx != 0:
                pshift_label = _pshift_code(p1p2)

            # append one row for this turn; ids are stored as strings
            annotate_df.loc[len(annotate_df.index)] = [  # type: ignore
                str(msg["utterance_ids"]),
                str(msg["speaker_id"]),
                msg["utterance"],
                str(msg["reply_to_id"]),
                p1p2,
                pshift_label,
            ]

    elif "target_id" in conv_df.columns:
        # Same layout as above, with "target_id" in place of "reply_to_id"
        annotate_df = pd.DataFrame(
            {
                "utterance_ids": [],
                "speaker_id": [],
                "utterance": [],
                "target_id": [],
                "label_desc": [],
                "pshift": [],
            }
        )

        # calculate the participation shift for each turn
        for idx, msg in enumerate(conversation):
            # if msg has no target, it is directed to the group
            if (
                msg["target_id"] == None
                or msg["target_id"] == "None"
                or msg["target_id"] == ""
            ):
                part_2 = " " + str(msg["speaker_id"]) + " to group"

            # if msg has a target, we save it
            else:
                part_2 = " " + str(msg["speaker_id"]) + " to " + str(msg["target_id"])

            # p1p2 takes the parshift label for the previous + current turn
            p1p2 = part_1 + part_2

            # part_1 takes the part_2 label for the next iteration
            # (the leading space of part_2 is dropped and a comma appended)
            part_1 = part_2[1:] + ","

            # set value to "" for first turn
            pshift_label = ""

            # we cannot calculate the pshift for the first turn
            if idx != 0:
                # the label and code are also stored on the turn dict itself
                msg["label"] = p1p2
                pshift_label = _pshift_code(p1p2)
                msg["pshift"] = pshift_label

            # append one row for this turn; ids are stored as strings
            annotate_df.loc[len(annotate_df.index)] = [  # type: ignore
                str(msg["utterance_ids"]),
                str(msg["speaker_id"]),
                msg["utterance"],
                str(msg["target_id"]),
                p1p2,
                pshift_label,
            ]

    # the textual label was only needed to derive the code
    annotate_df.drop(columns=["label_desc"], inplace=True)

    return annotate_df

cond_probs

cond_probs(pshift_codes: pd.DataFrame) -> pd.DataFrame

Determine the conditional probabilities for a sequence of participation shift codes.

Parameters:

Name Type Description Default
pshift_codes DataFrame

A sequence of participation shift codes obtained with annotate().

required

Returns:

Type Description
DataFrame

A data frame containing the frequency, probability and conditional probabilities (two) for each parshift code. This data frame is divided into two 'subgroups': (1) those beginning with an undirected remark (A0-); and, (2) those beginning with a directed one (AB-). The P(S|D) (Probability of a participation shift given a Directed or Undirected remark (D)) column contains the frequency divided by total occurrences in each subgroup, while the P(S|D,C) (Probability of a participation shift given a Directed or Undirected remark (D) and assuming Change of Speaker (C)) column contains the frequency divided by total occurrences in each subgroup, for each participation shift where the change of speaker occurs.

Source code in parshift/statistics.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
def cond_probs(pshift_codes: pd.DataFrame) -> pd.DataFrame:
    """Determine the conditional probabilities for a sequence of participation shift codes.

    Arguments:
        pshift_codes: A sequence of participation shift codes obtained with
            [`annotate()`][parshift.annotation.annotate].

    Returns:
        A data frame containing the frequency, probability and conditional probabilities
            (two) for each parshift code. This data frame is divided into two 'subgroups':
            (1) those beginning with an undirected remark (A0-); and, (2) those beginning
            with a directed one (AB-). The `P(S|D)` (Probability of a participation shift
            given a Directed or Undirected remark (D)) column contains the frequency divided
            by total occurrences in each subgroup, while the `P(S|D,C)` (Probability of
            a participation shift given a Directed or Undirected remark (D) and assuming
            Change of Speaker (C)) column contains the frequency divided by total occurrences
            in each subgroup, for each participation shift where the change of speaker occurs.

    Raises:
        TypeError: If `pshift_codes` is not a Pandas DataFrame.
    """

    if not isinstance(pshift_codes, pd.DataFrame):
        # The message names the actual parameter of this function.
        raise TypeError("Parameter pshift_codes must be a Dataframe")

    frequency_table_and_counts = _frequency_table(pshift_codes)
    freq_table = frequency_table_and_counts[0]

    # Codes whose P(S|D,C) cell is left blank (presumably the shifts that do
    # not involve a change of speaker).
    blank_cpetc = ("A0-AY", "AB-A0", "AB-AY", "A0-A0")

    def _ratio(count, total):
        # An empty subgroup yields 0 instead of a division by zero.
        return round(count / total, 2) if total != 0 else 0

    cond_prob = {}
    for key, count in freq_table.items():
        # Positions of the two denominators inside `frequency_table_and_counts`:
        # 1/3 for codes starting with "A0", 2/4 for all other codes.
        if key.split("-")[0] == "A0":
            total_idx, change_idx = 1, 3
        else:
            total_idx, change_idx = 2, 4

        cond_prob[key] = {
            "CP": _ratio(count, frequency_table_and_counts[total_idx]),
            "CPeTC": (
                ""
                if key in blank_cpetc
                else _ratio(count, frequency_table_and_counts[change_idx])
            ),
        }

    cond_prob_df = pd.DataFrame.from_dict(cond_prob, orient="index")
    freq = pd.DataFrame.from_dict(freq_table, orient="index", columns=["Frequency"])
    freq["Probability"] = round(freq["Frequency"] / freq["Frequency"].sum(), 2)

    # Join both tables on the pshift code and expose the code as a column
    result = (
        pd.concat([freq, cond_prob_df], axis=1)
        .reset_index()
        .rename(columns={"index": "pshift"})
    )

    # Order the rows by the rank that `_cp_order` assigns to each code
    result = result.sort_values(
        by=["pshift"], key=lambda x: x.map(_cp_order)
    ).reset_index(drop=True)

    # Keep pshift, Frequency, Probability, CP and CPeTC, in that order
    result = result.iloc[:, [0, 1, 2, 3, 4]]

    result["Change of Speaker (C)"] = result["pshift"].apply(_change_of_speaker)

    result["Directed Remark (D)"] = result["pshift"].apply(_targeted_remark)

    result.rename(
        columns={"pshift": "Pshift", "CP": "P(S|D)", "CPeTC": "P(S|D,C)"},
        inplace=True,
    )

    return result

conv2turns

conv2turns(conv_df: pd.DataFrame) -> List[Dict[str, Any]]

Take a conversation data frame and group it into conversation turns.

A turn is a group of messages sent by the same user and addressed to the same target.

Parameters:

Name Type Description Default
conv_df DataFrame

The conversation from where to obtain the conversation turns.

required

Returns:

Type Description
List[Dict[str, Any]]

A list of dictionaries, each representing a conversation turn.

Source code in parshift/annotation.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def conv2turns(conv_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Take a conversation data frame and group it into conversation turns.

    A turn is a group of consecutive messages sent by the same user and
    addressed to the same target.

    Arguments:
        conv_df: The conversation from where to obtain the conversation turns.
            It must contain the `utterance_id`, `speaker_id` and `utterance`
            columns, plus either `reply_to_id` or `target_id` (`reply_to_id`
            takes precedence when both are present).

    Returns:
        A list of dictionaries, each representing a conversation turn. Each
            dictionary has the keys `utterance_ids`, `speaker_id`, `utterance`
            and the name of the target column (`reply_to_id` or `target_id`).

    Raises:
        ValueError: If a non-empty conversation has neither a `reply_to_id`
            nor a `target_id` column.
    """

    conv_df = conv_df.reset_index()

    # Pick the column that identifies who/what each utterance is addressed to
    target_col = next(
        (col for col in ("reply_to_id", "target_id") if col in conv_df.columns),
        None,
    )
    if target_col is None:
        if conv_df.empty:
            # Nothing to group, so the missing column is harmless
            return []
        raise ValueError(
            "Conversation must have either a `reply_to_id` or a `target_id` column"
        )

    conversation: List[Dict[str, Any]] = []

    for _, row in conv_df.iterrows():
        # Normalize the target: missing values (NaN/None), empty strings and
        # the literal "None" all mean "no target"; anything else is an integer
        # ID, possibly encoded as a float or as a string
        raw_target = row[target_col]
        if pd.isna(raw_target) or raw_target in ("", "None"):
            target = None
        else:
            target = int(float(raw_target))

        previous = conversation[-1] if conversation else None

        if (
            previous is not None
            and previous["speaker_id"] == row["speaker_id"]
            and previous[target_col] == target
        ):
            # Same speaker and same target as the previous turn: merge the
            # message text and the utterance ID into that turn
            previous["utterance_ids"].append(row["utterance_id"])
            previous["utterance"] = ". ".join(
                [previous["utterance"], row["utterance"]]
            )
        else:
            # Otherwise, create a new dictionary representing a new turn
            conversation.append(
                {
                    "utterance_ids": [row["utterance_id"]],
                    "speaker_id": row["speaker_id"],
                    "utterance": row["utterance"],
                    target_col: target,
                }
            )

    return conversation

frequency_treemap

frequency_treemap(
    cond_probs_df: pd.DataFrame,
    ax: Optional[matplotlib.axes.Axes] = None,
    type: str = "Pshift",
) -> matplotlib.axes.Axes

Get a matplotlib axes object displaying the conditional probabilities or frequencies.

Parameters:

Name Type Description Default
cond_probs_df DataFrame

Dataframe with information about the participation shift conditional probabilities. This data frame can be obtained with cond_probs()

required
type str

Column name to be used to plot the treemap, either "Pshift" (default) or "Pshift_class".

'Pshift'
ax Optional[Axes]

Matplotlib axes on which to draw the treemap. If None, a new figure and axes are created.

None

Returns:

Name Type Description
ax Axes

Matplotlib axes with the participation shifts probabilities or frequency.

Source code in parshift/plotting.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def frequency_treemap(
    cond_probs_df: pd.DataFrame,
    ax: Optional[matplotlib.axes.Axes] = None,
    type: str = "Pshift",
) -> matplotlib.axes.Axes:
    """Get a matplotlib axes object displaying the participation shift frequencies.

    Each rectangle of the treemap is sized by the summed `Frequency` of a
    participation shift (or participation shift class) and labeled with its
    share of the total, in percent. Entries with zero frequency are omitted.

    Arguments:
        cond_probs_df: Dataframe with information about the participation shift
            conditional probabilities. This data frame can be obtained with
            [`cond_probs()`][parshift.statistics.cond_probs]. It is not modified.
        type: Column name to be used to plot the treemap, either `"Pshift"`
            (default) or `"Pshift_class"`.
        ax: Matplotlib axes on which to draw the treemap. If `None`, a new
            figure and axes are created.

    Returns:
        ax: Matplotlib axes with the participation shifts frequency treemap.

    Raises:
        TypeError: If `type` is not a string.
        ValueError: If `type` is neither `"Pshift"` nor `"Pshift_class"`.
    """

    if not isinstance(type, str):
        raise TypeError("Parameter type must be a String")
    if type not in ["Pshift_class", "Pshift"]:
        raise ValueError(
            "Parameter type must be one of the following: `Pshift`, `Pshift_class`"
        )

    frame = cond_probs_df
    if type == "Pshift_class":
        # Derive the class column on a copy so the caller's frame is untouched
        frame = cond_probs_df.assign(
            Pshift_class=cond_probs_df["Pshift"].apply(pshift_class)
        )

    gb_parshift = frame.groupby([type])["Frequency"].sum()

    # Keep only the entries that actually occur; zero-sized rectangles cannot
    # be laid out in a treemap
    data = [
        (size, name)
        for size, name in zip(gb_parshift.values, gb_parshift.index.values)
        if size != 0
    ]

    color_dict = {
        "Turn Receiving": "#86d87c",
        "AB-BA": "#86d87c",
        "AB-B0": "#c6ecbe",
        "AB-BY": "#7cd892",
        "Turn Claiming": "#f4b461",
        "A0-X0": "#f4b461",
        "A0-XA": "#fb9948",
        "A0-XY": "#efa107",
        "Turn Usurping": "#ff4d4d",
        "AB-X0": "#ff4d4d",
        "AB-XA": "#fb7477",
        "AB-XB": "#ef3b6e",
        "AB-XY": "#ef483b",
        "Turn Continuing": "#85eff9",
        "A0-AY": "#3b61ef",
        "AB-A0": "#85eff9",
        "AB-AY": "#b9befb",
    }

    if ax is None:
        _, ax = plt.subplots()

    # With no non-zero frequencies there is nothing to draw; return the
    # (empty) axes instead of failing while unpacking the data
    if data:
        sizes, names = zip(*data)
        total = sum(sizes)
        labels = [
            f"{name} \n {round(100 * (size / total), 1)}%"
            for size, name in data
        ]
        colors = [color_dict[name] for name in names]

        squarify.plot(
            sizes,
            label=labels,
            pad=2,
            color=colors,
            ax=ax,
        )

    # Hide the axis of the axes we drew on, which is not necessarily
    # pyplot's current axes when the caller supplied `ax`
    ax.axis("off")
    return ax

propensities

propensities(cond_probs_df: pd.DataFrame) -> pd.DataFrame

Determine the propensities from a conditional probabilities data frame.

Parameters:

Name Type Description Default
cond_probs_df DataFrame

A data frame with statistics obtained with cond_probs().

required

Returns:

Type Description
DataFrame

A data frame containing the propensities proposed by Gibson.

Source code in parshift/statistics.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def propensities(cond_probs_df: pd.DataFrame) -> pd.DataFrame:
    """Determine the propensities from a conditional probabilities data frame.

    The values are looked up by participation shift code (the `Pshift`
    column), so the result does not depend on the order of the rows.

    Arguments:
        cond_probs_df: A data frame with statistics obtained with
            [`cond_probs()`][parshift.statistics.cond_probs]. It must have the
            `Pshift`, `P(S|D)` and `P(S|D,C)` columns.

    Returns:
        A single-row data frame containing the propensities proposed by
            Gibson, in the columns `turn-receiving`, `targeting` and
            `termination`.

    Raises:
        KeyError: If a required column or participation shift code is missing.
    """

    by_code = cond_probs_df.set_index("Pshift")
    p_s_d = by_code["P(S|D)"]
    p_s_d_c = by_code["P(S|D,C)"]

    dic_propensities = {}

    # turn-receiving propensity -> AB-BA, AB-B0 and AB-BY ( P(S|D) )
    dic_propensities["turn-receiving"] = (
        p_s_d["AB-BA"] + p_s_d["AB-B0"] + p_s_d["AB-BY"]
    )

    # targeting propensity -> A0-XY, AB-BY and AB-XY ( P(S|D,C) )
    dic_propensities["targeting"] = (
        p_s_d_c["A0-XY"] + p_s_d_c["AB-BY"] + p_s_d_c["AB-XY"]
    )

    # termination propensity -> A0-AY, AB-A0 and AB-AY ( P(S|D) )
    dic_propensities["termination"] = (
        p_s_d["A0-AY"] + p_s_d["AB-A0"] + p_s_d["AB-AY"]
    )

    return pd.DataFrame([dic_propensities])

pshift_class

pshift_class(pshift: str) -> str

Returns the participation shift class given a participation shift code.

Parameters:

Name Type Description Default
pshift str

Participation shift code (e.g A0-XA).

required

Returns:

Type Description
str

Participation shift class for the given participation shift code (either "Turn Receiving", "Turn Claiming", "Turn Usurping" or "Turn Continuing").

Source code in parshift/annotation.py
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
def pshift_class(pshift: str) -> str:
    """Returns the participation shift class given a participation shift code.

    Arguments:
        pshift: Participation shift code (e.g. A0-XA).

    Returns:
        Participation shift class for the given participation shift code
            (either "Turn Receiving", "Turn Claiming", "Turn Usurping" or
            "Turn Continuing").

    Raises:
        TypeError: If `pshift` is not a string.
        ValueError: If `pshift` does not have the shape of a participation
            shift code.
    """

    if not isinstance(pshift, str):
        raise TypeError("Parameter pshift_code must be a String")
    # The whole string must be a code: `fullmatch` rejects surrounding junk,
    # and the character classes list the allowed letters only (a `|` inside
    # `[...]` would be matched literally)
    if not re.fullmatch(r"A[B0]-[ABX][ABXY0]", pshift):
        raise ValueError("Parameter pshift_code must be a parshift code. eg: AB-B0")

    return _p_shift_dict[pshift]

read_ccsv

read_ccsv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    **kwargs: Any
) -> pd.DataFrame

Read a conversation file in CSV format, validate it and return a data frame.

The conversation file should have the following columns:

  • utterance_id: ID of the message (int)
  • speaker_id: ID of the user sending the message (str)
  • utterance: The message itself (string)
  • reply_to_id or target_id: The reply ID or the target ID (int)

Parameters:

Name Type Description Default
filepath_or_buffer FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]

Any valid string path to CSV file, as accepted by Pandas read_csv() function.

required
**kwargs Any

Keyword parameters passed to Pandas read_csv() function.

{}

Returns:

Type Description
DataFrame

A Pandas DataFrame containing the validated conversation.

Source code in parshift/annotation.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def read_ccsv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    **kwargs: Any,
) -> pd.DataFrame:
    """Read a conversation file in CSV format, validate it and return a data frame.

    The conversation file should have the following columns:

    - `utterance_id`: ID of the message (int)
    - `speaker_id`: ID of the user sending the message (str)
    - `utterance`: The message itself (string)
    - `reply_to_id` or `target_id`: The reply ID or the target ID (int)

    Arguments:
        filepath_or_buffer: Any valid string path to CSV file, as accepted by
            Pandas [`read_csv()`][pandas.read_csv] function.
        **kwargs: Keyword parameters passed to Pandas
            [`read_csv()`][pandas.read_csv] function.

    Returns:
        A Pandas [`DataFrame`][pandas.DataFrame] containing the validated
            conversation. Missing values in the `reply_to_id` column (or in
            `target_id`, when `reply_to_id` is absent) are replaced by empty
            strings.

    Raises:
        ValueError: If a required column is missing, or if the file has
            neither a `reply_to_id` nor a `target_id` column.
    """

    # Read the conversation file
    conversation: pd.DataFrame = pd.read_csv(filepath_or_buffer, dtype=_p_shift_cols, **kwargs)  # type: ignore

    # Obtain potentially missing columns
    missing = set(_p_shift_cols.keys() - conversation.columns)

    # `reply_to_id` and `target_id` are alternatives: as long as one of them
    # is present, the absence of the other one is not an error and must not
    # be reported as such
    alternatives = {"reply_to_id", "target_id"}
    if not alternatives <= missing:
        missing -= alternatives

    # Check if we have missing columns
    if len(missing) == 1:
        raise ValueError(f"CSV file is missing the `{missing.pop()}` column")
    elif len(missing) > 1:
        # Sort the names so the message is deterministic
        raise ValueError(
            f"CSV file is missing the `{'`, `'.join(sorted(missing))}` columns"
        )

    # Change Nan values to empty strings in the `reply_to_id` or `target_id` column
    if "reply_to_id" in conversation.columns:
        conversation["reply_to_id"] = conversation["reply_to_id"].fillna("")
    else:
        conversation["target_id"] = conversation["target_id"].fillna("")

    return conversation