DATA Step Inputs and Outputs
In SAS, not all data movement happens in SQL. DATA steps read datasets via SET and MERGE, apply transformations, and write to output libraries. Extracting this I/O map is a core building block for lineage, documentation, and migration scoping.
Sample SAS
/* Create work.customers_filtered from work.customers. */
data work.customers_filtered;
/* keep= is a dataset option: only id, name and country are read from the input. */
set work.customers (keep=id name country);
/* Keep only the rows whose country is 'IT'. */
where country = 'IT';
/* Write the name column out as customer_name. */
rename name=customer_name;
run;
Add Dependencies
- Kotlin
- Java
// Where Gradle looks for artifacts: the local Maven repository, Maven Central,
// and the project's deps/ folder registered as a flat-directory repository.
repositories {
mavenLocal()
mavenCentral()
flatDir {
dirs("deps")
}
}
dependencies {
// The SAS parser is referenced directly as a JAR file located under deps/.
implementation(files("deps/sas-parser-with-dependencies-1.6.5-all.jar"))
}
// Where Gradle looks for artifacts: the local Maven repository, Maven Central,
// and the project's deps/ folder registered as a flat-directory repository.
repositories {
mavenLocal()
mavenCentral()
flatDir {
dirs("deps")
}
}
dependencies {
// The SAS parser is referenced directly as a JAR file located under deps/.
implementation(files("deps/sas-parser-with-dependencies-1.6.5-all.jar"))
// Java-facing Kolasu helpers; the Java sample imports
// com.strumenta.kolasu.javalib.Traversing (presumably shipped in this artifact).
implementation "com.strumenta.kolasu:kolasu-javalib:1.5.96"
}
Model DATA step I/O
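For each DataStep, collect the output dataset names from its datasets property and the input datasets from the SetStatement and MergeStatement nodes inside its statements list. The sample code below also records the variables named in KEEP statements and the names listed in RENAME statements.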
A DatasetSpec may carry an optional library and a name that is itself an AST node (Identifier, VariableExpression, …) — not always a plain string. Format both parts only when the parser actually provides them.
- Kotlin
- Java
import com.strumenta.kolasu.traversing.walk
import com.strumenta.kolasu.traversing.walkDescendants
import com.strumenta.sas.ast.Identifier
import com.strumenta.sas.ast.SourceFile
import com.strumenta.sas.ast.Variable
import com.strumenta.sas.ast.VariableExpression
import com.strumenta.sas.ast.datastep.DataStep
import com.strumenta.sas.ast.datastep.KeepStatement
import com.strumenta.sas.ast.datastep.MergeStatement
import com.strumenta.sas.ast.datastep.RenameStatement
import com.strumenta.sas.ast.datastep.SetStatement
import com.strumenta.sas.ast.other.DatasetSpec
import com.strumenta.kolasu.commercial.LicenseManager
import com.strumenta.sas.parser.SASLanguage
import java.io.File
/**
 * I/O summary of a single DATA step, as populated by `analyzeDataSteps`.
 *
 * @property outputs formatted names of the datasets the step declares as its targets.
 * @property inputs formatted names of the datasets read through SET and MERGE statements.
 * @property keptColumns names of the variables found under the step's KEEP statements.
 * @property renames names listed in the step's RENAME statements.
 */
data class DataStepIo(
val outputs: List<String>,
val inputs: List<String>,
val keptColumns: List<String>,
val renames: List<String>,
)
/**
 * Renders a [DatasetSpec] as text.
 *
 * The dataset name is an AST node, so it is unwrapped according to its concrete type:
 * a [VariableExpression] yields its `variable`, an [Identifier] yields its `name`,
 * any other non-null node falls back to `toString()`, and a missing name becomes `"?"`.
 *
 * @param spec the dataset reference to render.
 * @return `library.name` when the spec carries a library, otherwise just the name.
 */
fun formatDatasetSpec(spec: DatasetSpec): String {
    val baseName = when (val nameNode = spec.name) {
        is VariableExpression -> nameNode.variable
        is Identifier -> nameNode.name
        else -> nameNode?.toString() ?: "?"
    }
    // No library qualifier: the bare name is the whole result.
    val library = spec.library ?: return baseName
    return "$library.$baseName"
}
/**
 * Builds one [DataStepIo] for every [DataStep] found among the descendants of [root].
 *
 * For each step:
 * - outputs come from the step's `datasets`;
 * - inputs come from its SET and MERGE statements, in statement order
 *   (SET specifications without a dataset are skipped);
 * - kept columns are the named [Variable] nodes found while walking each KEEP statement;
 * - renames are the names listed by each RENAME statement, falling back to the
 *   node's `toString()` when a name is absent.
 *
 * @param root the parsed SAS source file.
 * @return the I/O records, in the order the steps are visited.
 */
fun analyzeDataSteps(root: SourceFile): List<DataStepIo> =
    root.walkDescendants(DataStep::class).map { step ->
        val statements = step.statements
        DataStepIo(
            outputs = step.datasets.map { formatDatasetSpec(it) },
            // SET and MERGE are handled in a single pass so that inputs keep statement order.
            inputs = statements.flatMap { stmt ->
                when (stmt) {
                    is SetStatement -> stmt.sets.mapNotNull { it.name?.let(::formatDatasetSpec) }
                    is MergeStatement -> stmt.datasets.map(::formatDatasetSpec)
                    else -> emptyList<String>()
                }
            },
            keptColumns = statements.filterIsInstance<KeepStatement>().flatMap { keep ->
                keep.walk().filterIsInstance<Variable>().mapNotNull { it.name?.name }.toList()
            },
            renames = statements.filterIsInstance<RenameStatement>().flatMap { rename ->
                rename.names.map { it.name ?: it.toString() }
            },
        )
    }.toList()
/**
 * Entry point of the sample: registers the license file, parses the example SAS
 * program and prints the I/O summary of every DATA step it contains.
 *
 * Returns without printing anything when the parse result has no root.
 */
fun main() {
    LicenseManager.registerLicense(File("licenses/strumenta.SAS.license"))

    val parsingResult = SASLanguage().parse(File("examples/SAS/all-the-code.sas"))
    val root = parsingResult.root ?: return

    for (io in analyzeDataSteps(root)) {
        println("DATA ${io.outputs} <- ${io.inputs}")
        // KEEP / RENAME lines are printed only when the step has something to report.
        if (io.keptColumns.isNotEmpty()) {
            println(" KEEP: ${io.keptColumns}")
        }
        if (io.renames.isNotEmpty()) {
            println(" RENAME: ${io.renames}")
        }
    }
}
import com.strumenta.kolasu.javalib.Traversing;
import com.strumenta.sas.ast.Identifier;
import com.strumenta.sas.ast.SourceFile;
import com.strumenta.sas.ast.Statement;
import com.strumenta.sas.ast.Variable;
import com.strumenta.sas.ast.VariableExpression;
import com.strumenta.sas.ast.datastep.DataStep;
import com.strumenta.sas.ast.datastep.KeepStatement;
import com.strumenta.sas.ast.datastep.MergeStatement;
import com.strumenta.sas.ast.datastep.RenameStatement;
import com.strumenta.sas.ast.datastep.SetStatement;
import com.strumenta.sas.ast.datastep.SetSpecification;
import com.strumenta.sas.ast.other.DatasetSpec;
import com.strumenta.kolasu.commercial.LicenseManager;
import com.strumenta.sas.parser.SASLanguage;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
 * Extracts the inputs and outputs of SAS DATA steps from a parsed source file.
 */
public class DataStepIo {

    /**
     * I/O summary of a single DATA step: the datasets it writes, the datasets it
     * reads through SET and MERGE, the variables named in KEEP statements and the
     * names listed in RENAME statements.
     */
    public static class IoRecord {
        public final List<String> outputs;
        public final List<String> inputs;
        public final List<String> keptColumns;
        public final List<String> renames;

        public IoRecord(List<String> outputs, List<String> inputs,
                        List<String> keptColumns, List<String> renames) {
            this.outputs = outputs;
            this.inputs = inputs;
            this.keptColumns = keptColumns;
            this.renames = renames;
        }
    }

    /**
     * Renders a dataset reference as text.
     *
     * The name is an AST node: a {@code VariableExpression} yields its variable, an
     * {@code Identifier} yields its name, any other non-null node falls back to
     * {@code toString()}, and a missing name becomes {@code "?"}.
     *
     * @param spec the dataset reference to render
     * @return {@code library.name} when a library is present, otherwise just the name
     */
    public static String formatDatasetSpec(DatasetSpec spec) {
        var nameNode = spec.getName();
        String textName;
        if (nameNode instanceof VariableExpression ve) {
            textName = ve.getVariable();
        } else if (nameNode instanceof Identifier id) {
            textName = id.getName();
        } else if (nameNode != null) {
            textName = nameNode.toString();
        } else {
            textName = "?";
        }

        var library = spec.getLibrary();
        if (library == null) {
            return textName;
        }
        return library + "." + textName;
    }

    /**
     * Produces one {@link IoRecord} per DATA step found below {@code root}
     * (visited breadth-first).
     *
     * @param root the parsed SAS source file
     * @return the I/O records, in visiting order
     */
    public static List<IoRecord> analyzeDataSteps(SourceFile root) {
        List<IoRecord> records = new ArrayList<>();
        Traversing.walkDescendantsBreadthFirst(root, DataStep.class, step -> {
            records.add(describeStep(step));
        });
        return records;
    }

    /** Collects outputs, inputs, kept columns and renames of a single DATA step. */
    private static IoRecord describeStep(DataStep step) {
        List<String> targets = new ArrayList<>();
        for (var target : step.getDatasets()) {
            targets.add(formatDatasetSpec(target));
        }

        List<String> sources = new ArrayList<>();
        List<String> keptColumns = new ArrayList<>();
        List<String> renamed = new ArrayList<>();

        // Single pass over the statements so that inputs keep statement order.
        for (Statement stmt : step.getStatements()) {
            if (stmt instanceof SetStatement setStmt) {
                for (SetSpecification setSpec : setStmt.getSets()) {
                    var dataset = setSpec.getName();
                    // SET specifications without a dataset are skipped.
                    if (dataset != null) {
                        sources.add(formatDatasetSpec(dataset));
                    }
                }
            } else if (stmt instanceof MergeStatement merge) {
                for (var dataset : merge.getDatasets()) {
                    sources.add(formatDatasetSpec(dataset));
                }
            } else if (stmt instanceof KeepStatement keep) {
                Traversing.walk(keep).forEach(node -> {
                    if (node instanceof Variable v && v.getName() != null) {
                        keptColumns.add(v.getName().getName());
                    }
                });
            } else if (stmt instanceof RenameStatement rename) {
                for (var id : rename.getNames()) {
                    // Fall back to the node's text when it carries no name.
                    renamed.add(id.getName() != null ? id.getName() : id.toString());
                }
            }
        }
        return new IoRecord(targets, sources, keptColumns, renamed);
    }
}
Build a lineage edge list
Merge DATA step I/O with SQL lineage output to produce (source → target) edges across an entire program or directory. That graph powers impact analysis (“what reads work.customers?”) and automated documentation.