[SPARK-46635][PYTHON][DOCS] Refine docstring of `from_csv/schema_of_csv/to_csv`

### What changes were proposed in this pull request?
This PR refines the docstrings of `from_csv/schema_of_csv/to_csv` and adds some new examples.

### Why are the changes needed?
To improve the PySpark documentation

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Pass GitHub Actions

### Was this patch authored or co-authored using generative AI tooling?
No

Closes apache#44639 from LuciferYang/csv-functions.

Authored-by: yangjie01 <[email protected]>
Signed-off-by: yangjie01 <[email protected]>
LuciferYang committed Jan 10, 2024
1 parent fcdfc8c commit bda9957
Showing 1 changed file with 167 additions and 28 deletions: python/pyspark/sql/functions/builtin.py
@@ -14925,7 +14925,7 @@ def to_xml(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Col
@_try_remote_functions
def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = None) -> Column:
"""
CSV Function: Parses a CSV string and infers its schema in DDL format.

.. versionadded:: 3.0.0

@@ -14935,9 +14935,9 @@ def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = N
Parameters
----------
csv : :class:`~pyspark.sql.Column` or str
A CSV string or a foldable string column containing a CSV string.
options : dict, optional
Options to control parsing. Accepts the same options as the CSV datasource.
See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
for the version you use.

@@ -14946,15 +14946,53 @@ def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = N
Returns
-------
:class:`~pyspark.sql.Column`
A string representation of a :class:`StructType` parsed from the given CSV.

Examples
--------
Example 1: Inferring the schema of a CSV string with different data types

>>> from pyspark.sql import functions as sf
>>> df = spark.range(1)
>>> df.select(sf.schema_of_csv(sf.lit('1|a|true'), {'sep':'|'})).show(truncate=False)
+-------------------------------------------+
|schema_of_csv(1|a|true) |
+-------------------------------------------+
|STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>|
+-------------------------------------------+

Example 2: Inferring the schema of a CSV string with missing values

>>> from pyspark.sql import functions as sf
>>> df = spark.range(1)
>>> df.select(sf.schema_of_csv(sf.lit('1||true'), {'sep':'|'})).show(truncate=False)
+-------------------------------------------+
|schema_of_csv(1||true) |
+-------------------------------------------+
|STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>|
+-------------------------------------------+

Example 3: Inferring the schema of a CSV string with a different delimiter

>>> from pyspark.sql import functions as sf
>>> df = spark.range(1)
>>> df.select(sf.schema_of_csv(sf.lit('1;a;true'), {'sep':';'})).show(truncate=False)
+-------------------------------------------+
|schema_of_csv(1;a;true) |
+-------------------------------------------+
|STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>|
+-------------------------------------------+

Example 4: Inferring the schema of a CSV string with quoted fields

>>> from pyspark.sql import functions as sf
>>> df = spark.range(1)
>>> df.select(sf.schema_of_csv(sf.lit('"1","a","true"'), {'sep':','})).show(truncate=False)
+-------------------------------------------+
|schema_of_csv("1","a","true") |
+-------------------------------------------+
|STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>|
+-------------------------------------------+
"""
if isinstance(csv, str):
col = _create_column_from_literal(csv)
@@ -14969,10 +15007,12 @@ def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = N
return _invoke_function("schema_of_csv", col, _options_to_str(options))
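
Editor's aside, not part of this commit: the DDL string that `schema_of_csv` infers can be fed straight into `from_csv`, which is handy when the column layout is not known up front. A minimal sketch, assuming an active SparkSession bound to `spark`; exact `show()` formatting may vary (see the Spark Connect note below).

>>> from pyspark.sql import functions as sf  # editor's sketch, not from the commit
>>> df = spark.createDataFrame([("1|a|true",)], ("value",))
>>> schema = sf.schema_of_csv(sf.lit("1|a|true"), {"sep": "|"})  # infer once from a sample row
>>> df.select(sf.from_csv(df.value, schema, {"sep": "|"})).show()  # reuse it to parse the column
+---------------+
|from_csv(value)|
+---------------+
|   {1, a, true}|
+---------------+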


# TODO(SPARK-46654) Re-enable the `Example 2` test after fixing the display
# difference between Regular Spark and Spark Connect on `df.show`.
@_try_remote_functions
def to_csv(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column:
"""
CSV Function: Converts a column containing a :class:`StructType` into a CSV string.
Throws an exception in the case of an unsupported type.

.. versionadded:: 3.0.0
@@ -14983,9 +15023,9 @@ def to_csv(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Col
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
Name of column containing a struct.
options : dict, optional
Options to control converting. Accepts the same options as the CSV datasource.
See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
for the version you use.

@@ -14994,15 +15034,65 @@ def to_csv(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Col
Returns
-------
:class:`~pyspark.sql.Column`
A CSV string converted from the given :class:`StructType`.

Examples
--------
Example 1: Converting a simple StructType to a CSV string

>>> from pyspark.sql import Row, functions as sf
>>> data = [(1, Row(age=2, name='Alice'))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(sf.to_csv(df.value)).show()
+-------------+
|to_csv(value)|
+-------------+
| 2,Alice|
+-------------+

Example 2: Converting a complex StructType to a CSV string

>>> from pyspark.sql import Row, functions as sf
>>> data = [(1, Row(age=2, name='Alice', scores=[100, 200, 300]))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(sf.to_csv(df.value)).show(truncate=False) # doctest: +SKIP
+-----------------------+
|to_csv(value) |
+-----------------------+
|2,Alice,"[100,200,300]"|
+-----------------------+

Example 3: Converting a StructType with null values to a CSV string

>>> from pyspark.sql import Row, functions as sf
>>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType
>>> data = [(1, Row(age=None, name='Alice'))]
>>> schema = StructType([
... StructField("key", IntegerType(), True),
... StructField("value", StructType([
... StructField("age", IntegerType(), True),
... StructField("name", StringType(), True)
... ]), True)
... ])
>>> df = spark.createDataFrame(data, schema)
>>> df.select(sf.to_csv(df.value)).show()
+-------------+
|to_csv(value)|
+-------------+
| ,Alice|
+-------------+

Example 4: Converting a StructType with different data types to a CSV string

>>> from pyspark.sql import Row, functions as sf
>>> data = [(1, Row(age=2, name='Alice', isStudent=True))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(sf.to_csv(df.value)).show()
+-------------+
|to_csv(value)|
+-------------+
| 2,Alice,true|
+-------------+
"""

return _invoke_function("to_csv", _to_java_column(col), _options_to_str(options))
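
Editor's aside, not part of this commit: because `to_csv` accepts the same options as the CSV datasource, a custom separator can be requested via `sep`. A minimal sketch under the same assumption of a SparkSession bound to `spark`.

>>> from pyspark.sql import Row, functions as sf  # editor's sketch, not from the commit
>>> df = spark.createDataFrame([(1, Row(age=2, name="Alice"))], ("key", "value"))
>>> df.select(sf.to_csv(df.value, {"sep": "|"})).show()  # pipe-separated instead of commas
+-------------+
|to_csv(value)|
+-------------+
|      2|Alice|
+-------------+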
@@ -16228,8 +16318,8 @@ def from_csv(
options: Optional[Dict[str, str]] = None,
) -> Column:
"""
CSV Function: Parses a column containing a CSV string into a row with the specified schema.
Returns `null` if the string cannot be parsed.

.. versionadded:: 3.0.0

@@ -16239,11 +16329,11 @@
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
A column or column name in CSV format.
schema : :class:`~pyspark.sql.Column` or str
A column, or Python string literal with schema in DDL format, to use when parsing the CSV column.
options : dict, optional
options to control parsing. accepts the same options as the CSV datasource.
Options to control parsing. Accepts the same options as the CSV datasource.
See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
for the version you use.

@@ -16252,22 +16342,71 @@
Returns
-------
:class:`~pyspark.sql.Column`
A column of parsed CSV values.

Examples
--------
Example 1: Parsing a simple CSV string

>>> from pyspark.sql import functions as sf
>>> data = [("1,2,3",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT")).show()
+---------------+
|from_csv(value)|
+---------------+
| {1, 2, 3}|
+---------------+

Example 2: Using schema_of_csv to infer the schema

>>> from pyspark.sql import functions as sf
>>> data = [("1,2,3",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> value = data[0][0]
>>> df.select(sf.from_csv(df.value, sf.schema_of_csv(value))).show()
+---------------+
|from_csv(value)|
+---------------+
| {1, 2, 3}|
+---------------+

Example 3: Ignoring leading white space in the CSV string

>>> from pyspark.sql import functions as sf
>>> data = [(" abc",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> options = {'ignoreLeadingWhiteSpace': True}
>>> df.select(from_csv(df.value, "s string", options).alias("csv")).collect()
[Row(csv=Row(s='abc'))]
>>> df.select(sf.from_csv(df.value, "s string", options)).show()
+---------------+
|from_csv(value)|
+---------------+
| {abc}|
+---------------+

Example 4: Parsing a CSV string with a missing value

>>> from pyspark.sql import functions as sf
>>> data = [("1,2,",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT")).show()
+---------------+
|from_csv(value)|
+---------------+
| {1, 2, NULL}|
+---------------+

Example 5: Parsing a CSV string with a different delimiter

>>> from pyspark.sql import functions as sf
>>> data = [("1;2;3",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> options = {'delimiter': ';'}
>>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT", options)).show()
+---------------+
|from_csv(value)|
+---------------+
| {1, 2, 3}|
+---------------+
"""

_get_active_spark_context()
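
Editor's aside, not part of this commit: the three functions compose into a round trip: serialize a struct with `to_csv`, then recover it with `from_csv` and an explicit DDL schema. A minimal sketch, again assuming a SparkSession bound to `spark`.

>>> from pyspark.sql import Row, functions as sf  # editor's sketch, not from the commit
>>> df = spark.createDataFrame([(1, Row(age=2, name="Alice"))], ("key", "value"))
>>> csv_df = df.select(sf.to_csv(df.value).alias("csv"))  # each row holds '2,Alice'
>>> csv_df.select(sf.from_csv("csv", "age INT, name STRING").alias("roundtrip")).show()
+----------+
| roundtrip|
+----------+
|{2, Alice}|
+----------+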
