Executing Kernels¶
In CUDA-Q, there are four ways in which one can execute quantum kernels:
- `sample`: yields measurement counts
- `run`: yields individual return values from multiple executions
- `observe`: yields expectation values
- `get_state`: yields the quantum statevector of the computation
Asynchronous programming is a technique that enables your program to start a potentially long-running task and still remain responsive to other events while that task runs, rather than having to wait until that task has finished. Once the task has finished, your program is presented with the result. The most intensive task in the computation is the execution of the quantum kernel; hence, each execution function can be parallelized, given access to multiple quantum processing units (multi-QPU), using `sample_async`, `run_async`, `observe_async` and `get_state_async`.
Since multi-QPU platforms are not yet feasible, we emulate each QPU with a GPU.
Sample¶
Quantum states collapse upon measurement and hence need to be sampled many times to gather statistics. The CUDA-Q `sample` call enables this:
import cudaq
import numpy as np
# Number of qubits passed to the example kernel.
qubit_count = 2
# Define the simulation target.
cudaq.set_target("qpp-cpu")
# Quantum kernel preparing a GHZ state: a Hadamard on the first qubit followed
# by a controlled-X from the first qubit onto every other qubit.
@cudaq.kernel
def kernel(qubit_count: int):
    register = cudaq.qvector(qubit_count)
    # Put the first qubit into superposition (2-qubit GHZ state when
    # qubit_count == 2).
    h(register[0])
    for target in range(1, qubit_count):
        x.ctrl(register[0], register[target])
    # Measurement is spelled out explicitly here; if no measurement is
    # specified, all qubits are measured in the Z-basis by default.
    mz(register)
# Print a text drawing of the circuit built by the kernel.
print(cudaq.draw(kernel, qubit_count))
# Sample the kernel with 1000 shots and print the measurement counts.
result = cudaq.sample(kernel, qubit_count, shots_count=1000)
print(result)
#include <cstdio>
#include <cudaq.h>
#include <cudaq/algorithms/draw.h>
// Quantum kernel preparing a GHZ state on `qubit_count` qubits.
__qpu__ void kernel(int qubit_count) {
  cudaq::qvector qubits(qubit_count);
  // Superpose the first qubit, then chain controlled-X gates between
  // neighbouring qubits (2-qubit GHZ state when qubit_count == 2).
  h(qubits[0]);
  for (auto control : cudaq::range(qubit_count - 1)) {
    x<cudaq::ctrl>(qubits[control], qubits[control + 1]);
  }
  // Measurement is spelled out explicitly here; if no measurement is
  // specified, all qubits are measured in the Z-basis by default.
  mz(qubits);
}
int main() {
  int qubit_count = 2;
  // Render the circuit as text and print it.
  auto diagram = cudaq::contrib::draw(kernel, qubit_count);
  printf("%s", diagram.data());
  // Sample the kernel and dump the measurement counts.
  auto counts = cudaq::sample(kernel, qubit_count);
  counts.dump();
  return 0;
}
     ╭───╮
q0 : ┤ h ├──●──
     ╰───╯╭─┴─╮
q1 : ─────┤ x ├
          ╰───╯
{ 11:506 00:494 }
Note that there is a subtle difference between how `sample` is executed with the target device set to a simulator and with the target device set to a QPU. In simulation mode, the quantum state is built once and then sampled \(s\) times, where \(s\) equals the `shots_count`. In hardware execution mode, the quantum state collapses upon measurement and hence needs to be rebuilt over and over again.
There are a number of helpful tools that can be found in the API docs to process the Sample_Result object produced by `sample`.
Sample Asynchronous¶
`sample` also supports asynchronous execution, via `sample_async`, for the arguments it accepts. One can parallelize over various kernels or variational parameters, or even distribute shot counts over multiple QPUs.
# Start sampling asynchronously; the returned handle gives access to the result.
result_async = cudaq.sample_async(kernel, qubit_count, shots_count=1000)
# `.get()` returns the sample result of the asynchronous execution.
print(result_async.get())
#include <cstdio>
#include <cudaq.h>
#include <cudaq/algorithms/draw.h>
// Quantum kernel preparing a GHZ state on `qubit_count` qubits.
__qpu__ void kernel(int qubit_count) {
  cudaq::qvector qubits(qubit_count);
  // Superpose the first qubit, then chain controlled-X gates between
  // neighbouring qubits (2-qubit GHZ state when qubit_count == 2).
  h(qubits[0]);
  for (auto control : cudaq::range(qubit_count - 1)) {
    x<cudaq::ctrl>(qubits[control], qubits[control + 1]);
  }
  // Measurement is spelled out explicitly here; if no measurement is
  // specified, all qubits are measured in the Z-basis by default.
  mz(qubits);
}
int main() {
  int qubit_count = 2;
  // Start sampling asynchronously, then fetch the result and dump the counts.
  auto pending = cudaq::sample_async(kernel, qubit_count);
  pending.get().dump();
  return 0;
}
{ 00:498 11:502 }
Run¶
The `run` API executes a quantum kernel multiple times and returns each individual result. Unlike `sample`, which collects measurement statistics as counts, `run` preserves each individual return value from every execution. This is useful when you need to analyze the distribution of returned values rather than just aggregated measurement counts.
Key points about `run`:
- Requires a kernel that returns a non-void value
- Returns a list containing all individual execution results
- Supports scalar types (bool, int, float) and custom data classes as return types
# Quantum kernel that prepares a GHZ state and returns how many qubits were
# measured in state |1⟩.
@cudaq.kernel
def simple_ghz(num_qubits: int) -> int:
    register = cudaq.qvector(num_qubits)
    # GHZ preparation: Hadamard on the first qubit, then a controlled-X from
    # it onto each remaining qubit.
    h(register[0])
    for target in range(1, num_qubits):
        x.ctrl(register[0], register[target])
    # Measure the qubits one by one and count the |1⟩ outcomes.
    ones = 0
    for index in range(num_qubits):
        if mz(register[index]):
            ones += 1
    return ones
# Execute the kernel 20 times; `results` holds one return value per shot.
num_qubits = 3
results = cudaq.run(simple_ghz, num_qubits, shots_count=20)
print(f"Executed {len(results)} shots")
print(f"Results: {results}")
print(f"Possible values: Either 0 or {num_qubits} due to GHZ state properties")
# Tally how often each return value occurred (first-seen order is kept).
value_counts = {}
for outcome in results:
    if outcome not in value_counts:
        value_counts[outcome] = 0
    value_counts[outcome] += 1
print("\nCounts of each result:")
for outcome, occurrences in value_counts.items():
    print(f"{outcome}: {occurrences} times")
#include <cstdio>
#include <cudaq.h>
#include <map>
// Define a quantum kernel that returns an integer: it prepares a GHZ state on
// `num_qubits` qubits and returns how many of them were measured in state |1⟩.
__qpu__ int simple_ghz(int num_qubits) {
  // Allocate qubits
  cudaq::qvector qubits(num_qubits);
  // Create GHZ state
  h(qubits[0]);
  for (int i = 1; i < num_qubits; i++) {
    x<cudaq::ctrl>(qubits[0], qubits[i]);
  }
  // Measure and return total number of qubits in state |1⟩
  int result = 0;
  for (int i = 0; i < num_qubits; i++) {
    if (mz(qubits[i])) {
      result += 1;
    }
  }
  return result;
}
int main() {
  // Execute the kernel 20 times on 3 qubits; `results` holds one return
  // value per execution.
  auto results = cudaq::run(20, simple_ghz, 3);
  // Print results as a comma-separated list
  printf("Executed %zu shots\n", results.size());
  printf("Results: [");
  for (size_t i = 0; i < results.size(); ++i) {
    printf("%d", results[i]);
    // `i + 1 < size()` avoids the unsigned wrap-around of `size() - 1`.
    if (i + 1 < results.size()) {
      printf(", ");
    }
  }
  printf("]\n");
  printf("Possible values: Either 0 or %d due to GHZ state properties\n", 3);
  // Count occurrences of each result (std::map keeps the keys sorted)
  std::map<int, int> value_counts;
  for (const auto &value : results) {
    value_counts[value]++;
  }
  printf("\nCounts of each result:\n");
  for (const auto &pair : value_counts) {
    printf("%d: %d times\n", pair.first, pair.second);
  }
  return 0;
}
Executed 20 shots
Results: [0, 3, 0, 3, 3, 3, 0, 3, 3, 3, 0, 0, 3, 0, 3, 3, 0, 3, 3, 3]
Possible values: Either 0 or 3 due to GHZ state properties
Counts of each result:
0: 8 times
3: 12 times
Return Custom Data Types¶
The `run` API also supports returning custom data types using Python’s data classes. This allows returning multiple values from your quantum computation in a structured way.
from dataclasses import dataclass
# Define a custom `dataclass` to return from our quantum kernel
@dataclass(slots=True)
class MeasurementResult:
first_qubit: bool
last_qubit: bool
total_ones: int
@cudaq.kernel
def bell_pair_with_data() -> MeasurementResult:
    # Prepare a Bell pair on two qubits.
    pair = cudaq.qvector(2)
    h(pair[0])
    x.ctrl(pair[0], pair[1])
    # Measure both qubits.
    first = mz(pair[0])
    last = mz(pair[1])
    # Count how many of the two qubits were measured as 1.
    ones = 0
    if first:
        ones = 1
    if last:
        ones = ones + 1
    # Package both outcomes and the count into the custom data structure.
    return MeasurementResult(first, last, ones)
# Run the kernel 10 times; each entry of `results` is one returned
# MeasurementResult.
results = cudaq.run(bell_pair_with_data, shots_count=10)
# Print the outcome of every shot.
print("Individual measurement results:")
for shot, outcome in enumerate(results):
    print(
        f"Shot {shot}: {{{outcome.first_qubit}, {outcome.last_qubit}}}\ttotal ones={outcome.total_ones}"
    )
# Verify the Bell state correlations: count the shots where both qubits agree.
correlated_count = 0
for outcome in results:
    if outcome.first_qubit == outcome.last_qubit:
        correlated_count += 1
print(
    f"\nCorrelated measurements: {correlated_count}/{len(results)} ({correlated_count/len(results)*100:.1f}%)"
)
#include <cstdio>
#include <cudaq.h>
// Define a custom data structure to return from the quantum kernel.
// The kernel fills it with aggregate initialization, so the field order
// matters.
struct MeasurementResult {
// Measurement outcome of the first qubit
bool first_qubit;
// Measurement outcome of the last qubit
bool last_qubit;
// Number of qubits measured as 1
int total;
};
// Quantum kernel that prepares a Bell pair, measures both qubits and returns
// the two outcomes together with the number of ones.
__qpu__ MeasurementResult bell_pair_with_data() {
  cudaq::qvector pair(2);
  h(pair[0]);
  x<cudaq::ctrl>(pair[0], pair[1]);
  // Measure both qubits.
  bool first = mz(pair[0]);
  bool last = mz(pair[1]);
  // Count how many of the two qubits were measured as 1.
  int ones = 0;
  if (first) {
    ones++;
  }
  if (last) {
    ones++;
  }
  return {first, last, ones};
}
int main() {
  // Run the kernel 10 times; `results` holds one MeasurementResult per shot.
  auto results = cudaq::run(10, bell_pair_with_data);
  int correlated_count = 0;
  // Use an unsigned index so the comparison with `results.size()` does not mix
  // signed and unsigned operands; `%zu` is the matching printf conversion.
  for (size_t i = 0; i < results.size(); ++i) {
    const auto &shot = results[i];
    printf("Shot %zu: {%d, %d} total ones=%d\n", i, shot.first_qubit,
           shot.last_qubit, shot.total);
    // In a Bell pair both qubits are expected to give the same outcome.
    if (shot.first_qubit == shot.last_qubit)
      correlated_count++;
  }
  printf("Correlated measurements: %d/%zu\n", correlated_count, results.size());
  return 0;
}
Individual measurement results:
Shot 0: {True, True} total ones=2
Shot 1: {False, False} total ones=0
Shot 2: {False, False} total ones=0
Shot 3: {False, False} total ones=0
Shot 4: {True, True} total ones=2
Shot 5: {True, True} total ones=2
Shot 6: {True, True} total ones=2
Shot 7: {False, False} total ones=0
Shot 8: {False, False} total ones=0
Shot 9: {True, True} total ones=2
Correlated measurements: 10/10 (100.0%)
Run Asynchronous¶
Similar to `sample_async`, `run` also supports asynchronous execution, via `run_async`, for the arguments it accepts.
# Example of `run_async` with a simple integer return type.
# Quantum kernel: rotate a single qubit about X by `angle` and return its
# measurement outcome as an integer.
@cudaq.kernel
def simple_count(angle: float) -> int:
    qubit = cudaq.qubit()
    rx(angle, qubit)
    return int(mz(qubit))
# Execute asynchronously with different parameters: one `run_async` call of
# 10 shots per rotation angle.
futures = []
angles = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4]
for angle in angles:
    futures.append(cudaq.run_async(simple_count, angle, shots_count=10))
# Collect the results in submission order.
for angle, future in zip(angles, futures):
    results = future.get()
    # Each entry is 0 or 1, so the sum is the number of ones measured.
    ones_count = sum(results)
    print(f"Angle {angle:.1f}: {ones_count}/{len(results)} ones measured")
#include <cstddef>
#include <cstdio>
#include <cudaq.h>
#include <future>
#include <numeric>
#include <vector>
// Define a quantum kernel that returns an integer based on measurement:
// rotate a single qubit about X by `angle` and return its measurement outcome.
__qpu__ int simple_count(double angle) {
  auto q = cudaq::qubit();
  rx(angle, q);
  return mz(q);
}
int main() {
  // Execute the kernel asynchronously with different parameters. The element
  // type matches the kernel's `double` parameter.
  std::vector<double> angles = {0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4};
  std::vector<std::future<std::vector<int>>> futures;
  for (double angle : angles) {
    // Leading arguments: QPU id 0 and 10 shots, followed by the kernel and
    // its argument.
    futures.push_back(cudaq::run_async(0, 10, simple_count, angle));
  }
  // Collect the results in submission order.
  for (std::size_t i = 0; i < futures.size(); ++i) {
    std::vector<int> results = futures[i].get();
    // Each entry is 0 or 1, so the sum is the number of ones measured.
    int ones_count = std::accumulate(results.begin(), results.end(), 0);
    printf("Angle %.1f: %d/10 ones measured\n", angles[i], ones_count);
  }
  return 0;
}
Angle 0.0: 0/10 ones measured
Angle 0.2: 0/10 ones measured
Angle 0.4: 0/10 ones measured
Angle 0.6: 0/10 ones measured
Angle 0.8: 1/10 ones measured
Angle 1.0: 2/10 ones measured
Angle 1.2: 3/10 ones measured
Angle 1.4: 5/10 ones measured
Note: Currently, `run` and `run_async` are only supported on simulator targets.
Observe¶
The `observe` function allows us to calculate expectation values. We must supply a spin operator in the form of a Hamiltonian, \(H\), from which we would like to calculate \(\langle\psi|H|\psi\rangle\).
from cudaq import spin
# Define a Hamiltonian in terms of Pauli spin operators: a sum of a Z term,
# a Y term and a product of an X and a Z term.
hamiltonian = spin.z(0) + spin.y(1) + spin.x(0) * spin.z(0)
@cudaq.kernel
def kernel1(n_qubits: int):
    register = cudaq.qvector(n_qubits)
    # Hadamard on the first qubit, then a controlled-X from it onto every
    # other qubit. No measurement is specified in this kernel.
    h(register[0])
    for target in range(1, n_qubits):
        x.ctrl(register[0], register[target])
# Compute the expectation value given the state prepared by the kernel.
# `qubit_count` is forwarded to `kernel1` as its `n_qubits` argument.
result = cudaq.observe(kernel1, hamiltonian, qubit_count).expectation()
print('<H> =', result)
#include <cstdio>
#include <cudaq.h>
#include <cudaq/algorithms/draw.h>
// Quantum kernel preparing a GHZ state on `qubit_count` qubits. No
// measurement is specified in this kernel.
__qpu__ void kernel(int qubit_count) {
  cudaq::qvector qubits(qubit_count);
  // Superpose the first qubit, then chain controlled-X gates between
  // neighbouring qubits (2-qubit GHZ state when qubit_count == 2).
  h(qubits[0]);
  for (auto control : cudaq::range(qubit_count - 1)) {
    x<cudaq::ctrl>(qubits[control], qubits[control + 1]);
  }
}
int main() {
  int qubit_count = 2;
  // Hamiltonian expressed in terms of Pauli spin operators.
  auto hamiltonian = cudaq::spin::z(0) + cudaq::spin::y(1) +
                     cudaq::spin::x(0) * cudaq::spin::z(0);
  // Evaluate the expectation value for the state prepared by the kernel.
  auto observation = cudaq::observe(kernel, hamiltonian, qubit_count);
  printf("%.6lf\n", observation.expectation());
  return 0;
}
<H> = 0.0
Observe Asynchronous¶
`observe` can be a time-intensive task. We can parallelize the execution of `observe`, using `observe_async`, via the arguments it accepts.
# Quantum kernel preparing a GHZ state; no measurement is specified here.
@cudaq.kernel
def kernel1(qubit_count: int):
    register = cudaq.qvector(qubit_count)
    # Hadamard on the first qubit, then a controlled-X from it onto every
    # other qubit (2-qubit GHZ state when qubit_count == 2).
    h(register[0])
    for target in range(1, qubit_count):
        x.ctrl(register[0], register[target])
# Expectation values of different Hamiltonians can be measured in parallel by
# submitting each one with its own `qpu_id`; this example submits a single one.
hamiltonian_1 = spin.x(0) + spin.y(1) + spin.z(0) * spin.y(1)
# Asynchronous execution on multiple `qpus` via `nvidia` `gpus`; `qpu_id`
# selects the QPU the call is submitted to (here 0).
result_1 = cudaq.observe_async(kernel1, hamiltonian_1, qubit_count, qpu_id=0)
# Retrieve results
print(result_1.get().expectation())
#include <cstdio>
#include <cudaq.h>
#include <cudaq/algorithms/draw.h>
// Quantum kernel preparing a GHZ state on `qubit_count` qubits. No
// measurement is specified in this kernel.
__qpu__ void kernel(int qubit_count) {
  cudaq::qvector qubits(qubit_count);
  // Superpose the first qubit, then chain controlled-X gates between
  // neighbouring qubits (2-qubit GHZ state when qubit_count == 2).
  h(qubits[0]);
  for (auto control : cudaq::range(qubit_count - 1)) {
    x<cudaq::ctrl>(qubits[control], qubits[control + 1]);
  }
}
int main() {
  int qubit_count = 2;
  // Hamiltonian in terms of Pauli spin operators. Several Hamiltonians could
  // be measured in parallel by submitting each one to its own QPU; this
  // example submits a single one.
  auto hamiltonian = cudaq::spin::x(0) + cudaq::spin::y(1) +
                     cudaq::spin::z(0) * cudaq::spin::y(1);
  // Asynchronous execution on multiple `qpus` via `nvidia` `gpus`; the
  // leading 0 is the id of the QPU the call is submitted to.
  auto pending = cudaq::observe_async(0, kernel, hamiltonian, qubit_count);
  // Fetch the result and print the expectation value.
  auto observation = pending.get();
  printf("%.6lf\n", observation.expectation());
  return 0;
}
1.1102230246251565e-16
Above, the `observe` call is parallelized over the `hamiltonian` parameter by submitting each Hamiltonian with its own `qpu_id`; however, we can parallelize over any of the arguments it accepts by just iterating over the `qpu_id`.
Get State¶
The `get_state` function gives us access to the quantum statevector of the computation. Remember that this is only feasible in simulation mode.
# Compute the statevector of the kernel
result = cudaq.get_state(kernel, qubit_count)
# Convert the state to a NumPy array for printing.
print(np.array(result))
#include <cstdio>
#include <cudaq.h>
#include <cudaq/algorithms/draw.h>
// Quantum kernel preparing a GHZ state on `qubit_count` qubits. No
// measurement is specified in this kernel.
__qpu__ void kernel(int qubit_count) {
  cudaq::qvector qubits(qubit_count);
  // Superpose the first qubit, then chain controlled-X gates between
  // neighbouring qubits (2-qubit GHZ state when qubit_count == 2).
  h(qubits[0]);
  for (auto control : cudaq::range(qubit_count - 1)) {
    x<cudaq::ctrl>(qubits[control], qubits[control + 1]);
  }
}
int main() {
int qubit_count = 2;
// Compute the statevector of the kernel
cudaq::state t = cudaq::get_state(kernel, qubit_count);
t.dump();
return 0;
}
[0.+0.j 0.+0.j 0.+0.j 1.+0.j]
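The statevector generated by the `get_state` command follows Big-endian convention for associating numbers with their binary representations, which places the least significant bit on the left. That is, for the example of a 2-bit system, we have the following translation between integers and bits:
\[
\begin{array}{ll}
\text{Integer} & \text{Binary representation (least significant bit on the left)} \\
0 = 0 \cdot 2^0 + 0 \cdot 2^1 & 00 \\
1 = 1 \cdot 2^0 + 0 \cdot 2^1 & 10 \\
2 = 0 \cdot 2^0 + 1 \cdot 2^1 & 01 \\
3 = 1 \cdot 2^0 + 1 \cdot 2^1 & 11
\end{array}
\]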
Get State Asynchronous¶
Similar to `observe_async` above, `get_state` also supports asynchronous execution, via `get_state_async`, for the arguments it accepts.
import numpy as np
@cudaq.kernel
def bell_state():
    # Bell state preparation: Hadamard on the first qubit followed by a
    # controlled-X onto the second.
    pair = cudaq.qvector(2)
    h(pair[0])
    x.ctrl(pair[0], pair[1])
# Get state asynchronously; the returned handle gives access to the state.
state_future = cudaq.get_state_async(bell_state)
# Do other work while waiting for state computation...
print("Computing state asynchronously...")
# Get the state when ready
state = state_future.get()
# Convert the state to a NumPy array for printing.
print("Bell state vector:")
print(np.array(state))
#include <cstdio>
#include <cudaq.h>
// Quantum kernel for Bell state preparation: Hadamard on the first qubit
// followed by a controlled-X onto the second.
__qpu__ void bell_state() {
  cudaq::qvector pair(2);
  h(pair[0]);
  x<cudaq::ctrl>(pair[0], pair[1]);
}
int main() {
  // Start the state computation asynchronously.
  auto pending_state = cudaq::get_state_async(bell_state);
  // Other work could be done here while the state is being computed.
  // Fetch the state once it is ready and print it.
  auto bell = pending_state.get();
  bell.dump();
  return 0;
}
Computing state asynchronously...
Bell state vector:
[0.70710678+0.j 0. +0.j 0. +0.j 0.70710678+0.j]