[SPARK-46635][PYTHON][DOCS] Refine docstring of `from_csv/schema_of_csv/to_csv`

### What changes were proposed in this pull request?
This PR refines the docstrings of `from_csv/schema_of_csv/to_csv` and adds some new examples.

### Why are the changes needed?
To improve the PySpark documentation

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Pass GitHub Actions

### Was this patch authored or co-authored using generative AI tooling?
No

Closes apache#44639 from LuciferYang/csv-functions.

Authored-by: yangjie01 <[email protected]>
Signed-off-by: yangjie01 <[email protected]>
LuciferYang committed Jan 10, 2024
1 parent fcdfc8c commit bda9957
Showing 1 changed file with 167 additions and 28 deletions: python/pyspark/sql/functions/builtin.py
@@ -14925,7 +14925,7 @@ def to_xml(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Col
@_try_remote_functions
def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = None) -> Column:
"""
CSV Function: Parses a CSV string and infers its schema in DDL format.

.. versionadded:: 3.0.0

@@ -14935,9 +14935,9 @@ def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = N
Parameters
----------
csv : :class:`~pyspark.sql.Column` or str
A CSV string or a foldable string column containing a CSV string.
options : dict, optional
Options to control parsing. Accepts the same options as the CSV datasource.
See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
for the version you use.

@@ -14946,15 +14946,53 @@ def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = N
Returns
-------
:class:`~pyspark.sql.Column`
A string representation of a :class:`StructType` parsed from the given CSV.

Examples
--------
Example 1: Inferring the schema of a CSV string with different data types

>>> from pyspark.sql import functions as sf
>>> df = spark.range(1)
>>> df.select(sf.schema_of_csv(sf.lit('1|a|true'), {'sep':'|'})).show(truncate=False)
+-------------------------------------------+
|schema_of_csv(1|a|true) |
+-------------------------------------------+
|STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>|
+-------------------------------------------+

Example 2: Inferring the schema of a CSV string with missing values

>>> from pyspark.sql import functions as sf
>>> df = spark.range(1)
>>> df.select(sf.schema_of_csv(sf.lit('1||true'), {'sep':'|'})).show(truncate=False)
+-------------------------------------------+
|schema_of_csv(1||true) |
+-------------------------------------------+
|STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>|
+-------------------------------------------+

Example 3: Inferring the schema of a CSV string with a different delimiter

>>> from pyspark.sql import functions as sf
>>> df = spark.range(1)
>>> df.select(sf.schema_of_csv(sf.lit('1;a;true'), {'sep':';'})).show(truncate=False)
+-------------------------------------------+
|schema_of_csv(1;a;true) |
+-------------------------------------------+
|STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>|
+-------------------------------------------+

Example 4: Inferring the schema of a CSV string with quoted fields

>>> from pyspark.sql import functions as sf
>>> df = spark.range(1)
>>> df.select(sf.schema_of_csv(sf.lit('"1","a","true"'), {'sep':','})).show(truncate=False)
+-------------------------------------------+
|schema_of_csv("1","a","true") |
+-------------------------------------------+
|STRUCT<_c0: INT, _c1: STRING, _c2: BOOLEAN>|
+-------------------------------------------+
"""
if isinstance(csv, str):
col = _create_column_from_literal(csv)
@@ -14969,10 +15007,12 @@ def schema_of_csv(csv: Union[Column, str], options: Optional[Dict[str, str]] = N
return _invoke_function("schema_of_csv", col, _options_to_str(options))
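
Editor's aside, not part of this commit: the DDL string that `schema_of_csv` infers can be fed straight into `from_csv`, which is handy when the column layout is not known up front. A minimal sketch, assuming an active SparkSession bound to `spark`; exact `show()` formatting may vary (see the Spark Connect note below).

>>> from pyspark.sql import functions as sf  # editor's sketch, not from the commit
>>> df = spark.createDataFrame([("1|a|true",)], ("value",))
>>> schema = sf.schema_of_csv(sf.lit("1|a|true"), {"sep": "|"})  # infer once from a sample row
>>> df.select(sf.from_csv(df.value, schema, {"sep": "|"})).show()  # reuse it to parse the column
+---------------+
|from_csv(value)|
+---------------+
|   {1, a, true}|
+---------------+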


# TODO(SPARK-46654) Re-enable the `Example 2` test after fixing the display
# difference between Regular Spark and Spark Connect on `df.show`.
@_try_remote_functions
def to_csv(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column:
"""
CSV Function: Converts a column containing a :class:`StructType` into a CSV string.
Throws an exception in the case of an unsupported type.

.. versionadded:: 3.0.0
@@ -14983,9 +15023,9 @@ def to_csv(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Col
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
Name of column containing a struct.
options : dict, optional
Options to control converting. Accepts the same options as the CSV datasource.
See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
for the version you use.

@@ -14994,15 +15034,65 @@ def to_csv(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Col
Returns
-------
:class:`~pyspark.sql.Column`
A CSV string converted from the given :class:`StructType`.

Examples
--------
Example 1: Converting a simple StructType to a CSV string

>>> from pyspark.sql import Row, functions as sf
>>> data = [(1, Row(age=2, name='Alice'))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(sf.to_csv(df.value)).show()
+-------------+
|to_csv(value)|
+-------------+
| 2,Alice|
+-------------+

Example 2: Converting a complex StructType to a CSV string

>>> from pyspark.sql import Row, functions as sf
>>> data = [(1, Row(age=2, name='Alice', scores=[100, 200, 300]))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(sf.to_csv(df.value)).show(truncate=False) # doctest: +SKIP
+-----------------------+
|to_csv(value) |
+-----------------------+
|2,Alice,"[100,200,300]"|
+-----------------------+

Example 3: Converting a StructType with null values to a CSV string

>>> from pyspark.sql import Row, functions as sf
>>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType
>>> data = [(1, Row(age=None, name='Alice'))]
>>> schema = StructType([
... StructField("key", IntegerType(), True),
... StructField("value", StructType([
... StructField("age", IntegerType(), True),
... StructField("name", StringType(), True)
... ]), True)
... ])
>>> df = spark.createDataFrame(data, schema)
>>> df.select(sf.to_csv(df.value)).show()
+-------------+
|to_csv(value)|
+-------------+
| ,Alice|
+-------------+

Example 4: Converting a StructType with different data types to a CSV string

>>> from pyspark.sql import Row, functions as sf
>>> data = [(1, Row(age=2, name='Alice', isStudent=True))]
>>> df = spark.createDataFrame(data, ("key", "value"))
>>> df.select(sf.to_csv(df.value)).show()
+-------------+
|to_csv(value)|
+-------------+
| 2,Alice,true|
+-------------+
"""

return _invoke_function("to_csv", _to_java_column(col), _options_to_str(options))
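
Editor's aside, not part of this commit: because `to_csv` accepts the same options as the CSV datasource, a custom separator can be requested via `sep`. A minimal sketch under the same assumption of a SparkSession bound to `spark`.

>>> from pyspark.sql import Row, functions as sf  # editor's sketch, not from the commit
>>> df = spark.createDataFrame([(1, Row(age=2, name="Alice"))], ("key", "value"))
>>> df.select(sf.to_csv(df.value, {"sep": "|"})).show()  # pipe-separated instead of commas
+-------------+
|to_csv(value)|
+-------------+
|      2|Alice|
+-------------+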
@@ -16228,8 +16318,8 @@ def from_csv(
options: Optional[Dict[str, str]] = None,
) -> Column:
"""
CSV Function: Parses a column containing a CSV string into a row with the specified schema.
Returns `null` if the string cannot be parsed.

.. versionadded:: 3.0.0

@@ -16239,11 +16329,11 @@
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
A column or column name in CSV format.
schema : :class:`~pyspark.sql.Column` or str
A column, or Python string literal with schema in DDL format, to use when parsing the CSV column.
options : dict, optional
options to control parsing. accepts the same options as the CSV datasource.
Options to control parsing. Accepts the same options as the CSV datasource.
See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_
for the version you use.

@@ -16252,22 +16342,71 @@
Returns
-------
:class:`~pyspark.sql.Column`
A column of parsed CSV values.

Examples
--------
Example 1: Parsing a simple CSV string

>>> from pyspark.sql import functions as sf
>>> data = [("1,2,3",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT")).show()
+---------------+
|from_csv(value)|
+---------------+
| {1, 2, 3}|
+---------------+

Example 2: Using schema_of_csv to infer the schema

>>> from pyspark.sql import functions as sf
>>> data = [("1,2,3",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> value = data[0][0]
>>> df.select(sf.from_csv(df.value, sf.schema_of_csv(value))).show()
+---------------+
|from_csv(value)|
+---------------+
| {1, 2, 3}|
+---------------+

Example 3: Ignoring leading white space in the CSV string

>>> from pyspark.sql import functions as sf
>>> data = [(" abc",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> options = {'ignoreLeadingWhiteSpace': True}
>>> df.select(from_csv(df.value, "s string", options).alias("csv")).collect()
[Row(csv=Row(s='abc'))]
>>> df.select(sf.from_csv(df.value, "s string", options)).show()
+---------------+
|from_csv(value)|
+---------------+
| {abc}|
+---------------+

Example 4: Parsing a CSV string with a missing value

>>> from pyspark.sql import functions as sf
>>> data = [("1,2,",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT")).show()
+---------------+
|from_csv(value)|
+---------------+
| {1, 2, NULL}|
+---------------+

Example 5: Parsing a CSV string with a different delimiter

>>> from pyspark.sql import functions as sf
>>> data = [("1;2;3",)]
>>> df = spark.createDataFrame(data, ("value",))
>>> options = {'delimiter': ';'}
>>> df.select(sf.from_csv(df.value, "a INT, b INT, c INT", options)).show()
+---------------+
|from_csv(value)|
+---------------+
| {1, 2, 3}|
+---------------+
"""

_get_active_spark_context()
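
Editor's aside, not part of this commit: the three functions compose into a round trip: serialize a struct with `to_csv`, then recover it with `from_csv` and an explicit DDL schema. A minimal sketch, again assuming a SparkSession bound to `spark`.

>>> from pyspark.sql import Row, functions as sf  # editor's sketch, not from the commit
>>> df = spark.createDataFrame([(1, Row(age=2, name="Alice"))], ("key", "value"))
>>> csv_df = df.select(sf.to_csv(df.value).alias("csv"))  # each row holds '2,Alice'
>>> csv_df.select(sf.from_csv("csv", "age INT, name STRING").alias("roundtrip")).show()
+----------+
| roundtrip|
+----------+
|{2, Alice}|
+----------+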
