Profiling scripts are associated with extensible connections and are used to perform data profiling of source systems. There are a number of environment variables and token replacements available to developers when writing profiling scripts. All profiling scripts must return information about the data profiling result in a specific JSON format.
Environment variables in scripts
The following script environment variables are available to all profiling scripts.
Variable | Description |
---|---|
WSL_SRCCFG_<configured_field_name> | The values of configured connection properties can be accessed using environment variables. |
WSL_CON_JSON | The full path to a file containing the output from the most recent browse script execution. |
WSL_BINDIR | The WhereScape 3D installation path. (The default location is 'C:\Program Files\WhereScape\WhereScape 3D'.) |
WSL_WORKDIR | The working directory where the script is executed. (The working directory will be in %UserProfile%\WhereScape\3D\scriptExecution.) |
WSL_TASK_NAME | A name used to identify the workflow task. |
WSL_SEQUENCE | A sequence id to identify each workflow process. |
WSL_WRKFL_Repo | The repository name associated with the object selected in the repositories pane. |
WSL_WRKFL_Category | The category name associated with the object selected in the repositories pane. |
WSL_WRKFL_Model | The model name associated with the object selected in the repositories pane. |
WSL_WRKFL_Version | The version name associated with the object selected in the repositories pane. |
WSL_WRKFL_Group | The group name associated with the object selected in the repositories pane. |
WSL_WRKFL_Table | The table name associated with the object selected in the repositories pane. If no table object is selected then the value should be an empty string. |
WSL_WRKFL_Column | The column name associated with the object selected in the repositories pane. If no table object is selected then the value should be an empty string. |
Profiling script output
Each profiling script should return a set of JSON containing the results of the data profiling process. An example of the output required is displayed below.
{ "table_metrics": [ { "name": "Rowcount", "description": "The number of rows in the table.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "value": "100" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "value": "15" } ] }, { "name": "Space used in MB", "description": "The amount of space used by the table in MB.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "value": "10" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "value": "2" } ] }, { "name": "Average row length", "description": "The average length per row in the table.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "value": "10" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "value": "6" } ] } ], "column_metrics": [ { "name": "Inferred data type", "description": "The WhereScape 3D data type inferred from the values stored in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "INTEGER" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "VARCHAR(50)" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "INTEGER" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "INTEGER" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "VARCHAR(50)" } ] }, { "name": "Distinct value count", "description": "The number of unique values (excluding NULL) in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "53" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "52" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "4" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "4" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "4" } ] }, { "name": "Is unique", "description": "Is the data in this attribute unique for every row.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "True" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "False" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "False" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "True" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "True" } ] }, { "name": "Null count", "description": "The number of null values in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "0" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "0" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "0" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "0" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "0" } ] }, { "name": "Null %", "description": "The percentage of null values in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "0.0" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "0.0" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "0.0" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "0.0" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "0.0" } ] }, { "name": "Populated count", "description": "The number of non-null values in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "53" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "53" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "53" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "4" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "4" } ] }, { "name": "Populated %", "description": "The percentage of non-null values in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "100.0" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "100.0" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "100.0" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "100.0" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "100.0" } ] }, { "name": "Min value", "description": "The minimum value in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "01581" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "Atlanta" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "1" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "1" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "Eastern" } ] }, { "name": "Max value", "description": "The maximum value in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "98104" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "Wilton" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "4" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "4" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "Western" } ] }, { "name": "Min character count", "description": "The minimum number of characters in at least one value of this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "5" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "4" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "7" } ] }, { "name": "Max character count", "description": "The maximum number of characters in at least one value of this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "5" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "16" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "8" } ] }, { "name": "Avg character count", "description": "The average number of characters in at least one value of this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "5" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "9" } ] }, { "name": "Max trimmed length", "description": "The maximum trimmed length in at least one value of this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "value": "5" }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "value": "16" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "value": "8" } ] }, { "name": "Sum value", "description": "The sum of all the values in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "114" } ] }, { "name": "Avg value", "description": "The average of all the values in this column.", "type": "single_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "value": "2.15" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "value": "2.5" } ] }, { "name": "Top 10 most frequent values", "description": "The top 10 most frequent values in this column.", "type": "pair_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "key": [ "01581", "01730", "01833", "02116", "02139", "02184", "02903", "03049", "03801", "06897" ], "value": [ "1", "1", "1", "1", "1", "1", "1", "1", "1", "1" ] }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "key": [ "Atlanta", "Austin", "Beachwood", "Bedford", "Bellevue", "Bentonville", "Bloomfield Hills", "Boston", "Braintree", "Cambridge" ], "value": [ "1", "1", "1", "1", "1", "1", "1", "1", "1", "1" ] }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "key": [ "" ], "value": "" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "key": [ "1", "2", "3", "4" ], "value": [ "1", "1", "1", "1" ] }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "key": [ "Eastern", "Northern", "Sothern", "Western" ], "value": [ "1", "1", "1", "1" ] } ] }, { "name": "Top 10 least frequent values", "description": "The top 10 least frequent values in this column.", "type": "pair_value", "metric_values": [ { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "Territory ID", "key": [ "01581", "01730", "01833", "02116", "02139", "02184", "02903", "03049", "03801", "06897" ], "value": [ "1", "1", "1", "1", "1", "1", "1", "1", "1", "1" ] }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "TerritoryDescription", "key": [ "New York", "Neward", "Orlando", "Philadelphia", "Phonix", "Portsmouth", "Providence", "Racine", "Redmond", "Rockville" ], "value": [ "2", "1", "1", "1", "1", "1", "1", "1", "1", "1" ] }, { "schema": "D:\\Northwind\\csv", "table": "Territories", "column": "RegionID", "key": [ "" ], "value": "" }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionID", "key": [ "1", "2", "3", "4" ], "value": [ "1", "1", "1", "1" ] }, { "schema": "D:\\Northwind\\csv", "table": "Region", "column": "RegionDescription", "key": [ "Eastern", "Northern", "Sothern", "Western" ], "value": [ "1", "1", "1", "1" ] } ] } ], "relationship_metrics": [ { "name": "In use %", "description": "Percentage of keys used by the referencing table.", "type": "single_value", "metric_values": [ { "pk_schema": "D:\\Northwind\\csv", "pk_table": "Region", "pk_column": "RegionID", "fk_schema": "D:\\Northwind\\csv", "fk_table": "Territories", "fk_column": "RegionID", "value": "100.0" } ] }, { "name": "Referentially invalid values %", "description": "Percentage of values in the referencing foreign key field not in the referenced field.", "type": "single_value", "metric_values": [ { "pk_schema": "D:\\Northwind\\csv", "pk_table": "Region", "pk_column": "RegionID", "fk_schema": "D:\\Northwind\\csv", "fk_table": "Territories", "fk_column": "RegionID", "value": "0.0" } ] }, { "name": "Top 10 used PKs", "description": "The top 10 most frequent used PK values.", "type": "pair_value", "metric_values": [ { "pk_schema": "D:\\Northwind\\csv", "pk_table": "Region", "pk_column": "RegionID", "fk_schema": "D:\\Northwind\\csv", "fk_table": "Territories", "fk_column": "RegionID", "key": [ "1", "2", "3", "4" ], "value": [ "19", "15", "11", "8" ] } ] } ] }