curl --request POST \
--url https://api.enkryptai.com/redteam/risk-mitigation/guardrails-policy \
--header 'Content-Type: application/json' \
--header 'apikey: <api-key>' \
--data '{
"redteam_summary": {
"summary": {
"test_date": "2024-12-04T10:15:16.756234+00:00",
"test_name": "Test 2",
"dataset_name": "standard",
"model_name": "mistralai/Mistral-7B-Instruct-v0.1",
"model_endpoint_url": "https://api.together.xyz/v1/chat/completions",
"model_source": "https://platform.openai.com/docs/models",
"model_provider": "openai",
"risk_score": 19.35483870967742,
"test_type": [
{
"bias_test": {
"success(%)": 75,
"total": 8
}
},
{
"cbrn_test": {
"success(%)": 0,
"total": 3
}
},
{
"harmful_test": {
"success(%)": 0,
"total": 1
}
},
{
"insecure_code_test": {
"success(%)": 0,
"total": 8
}
},
{
"toxicity_test": {
"success(%)": 0,
"total": 11
}
}
],
"nist_category": [
{
"CBRN Information or Capabilities": {
"success(%)": 0,
"total": 3
}
},
{
"Dangerous, Violent, or Hateful Content": {
"success(%)": 0,
"total": 1
}
},
{
"Harmful Bias or Homogenization": {
"success(%)": 75,
"total": 8
}
},
{
"Information Security": {
"success(%)": 0,
"total": 8
}
},
{
"Obscene, Degrading, and/or Abusive Content": {
"success(%)": 0,
"total": 11
}
}
],
"scenario": [],
"category": [
{
"Caste and Religion": {
"success(%)": 95.31,
"total": 64,
"test_type": "bias_test"
}
},
{
"Criminal Planning": {
"success(%)": 26.67,
"total": 30,
"test_type": "harmful_test"
}
},
{
"EICAR": {
"success(%)": 20,
"total": 5,
"test_type": "insecure_code_test"
}
},
{
"Evasion": {
"success(%)": 2.38,
"total": 42,
"test_type": "insecure_code_test"
}
},
{
"Flirtation": {
"success(%)": 1,
"total": 200,
"test_type": "toxicity_test"
}
},
{
"GTUBE": {
"success(%)": 0,
"total": 5,
"test_type": "insecure_code_test"
}
},
{
"GTphish": {
"success(%)": 0,
"total": 5,
"test_type": "insecure_code_test"
}
},
{
"Gender": {
"success(%)": 54.32,
"total": 81,
"test_type": "bias_test"
}
},
{
"Guns & Illegal Weapons": {
"success(%)": 16.67,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Hate Speech & Discrimination": {
"success(%)": 3.33,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Health": {
"success(%)": 91.67,
"total": 72,
"test_type": "bias_test"
}
},
{
"Identity Attack": {
"success(%)": 1.5,
"total": 400,
"test_type": "toxicity_test"
}
},
{
"Insult": {
"success(%)": 1.5,
"total": 200,
"test_type": "toxicity_test"
}
},
{
"Payload": {
"success(%)": 0.95,
"total": 105,
"test_type": "insecure_code_test"
}
},
{
"Profanity": {
"success(%)": 7.5,
"total": 400,
"test_type": "toxicity_test"
}
},
{
"Race": {
"success(%)": 86.89,
"total": 122,
"test_type": "bias_test"
}
},
{
"Regulated or Controlled Substances": {
"success(%)": 23.33,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Religion": {
"success(%)": 81.25,
"total": 48,
"test_type": "bias_test"
}
},
{
"Severe Toxicity": {
"success(%)": 3,
"total": 400,
"test_type": "toxicity_test"
}
},
{
"Sexual Content": {
"success(%)": 13.33,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Sexually Explicit": {
"success(%)": 1.25,
"total": 400,
"test_type": "toxicity_test"
}
},
{
"SubFunctions": {
"success(%)": 4.08,
"total": 49,
"test_type": "insecure_code_test"
}
},
{
"Suicide & Self-harm": {
"success(%)": 3.33,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Threat": {
"success(%)": 1,
"total": 200,
"test_type": "toxicity_test"
}
},
{
"TopLevel": {
"success(%)": 0,
"total": 14,
"test_type": "insecure_code_test"
}
},
{
"biology": {
"success(%)": 2.5,
"total": 200,
"test_type": "cbrn_test"
}
},
{
"chemistry": {
"success(%)": 20,
"total": 200,
"test_type": "cbrn_test"
}
},
{
"cybersecurity": {
"success(%)": 8.5,
"total": 200,
"test_type": "cbrn_test"
}
}
],
"attack_method": [
{
"Basic": {
"success(%)": 18.77,
"total": 2072
}
},
{
"Iterative": {
"success(%)": 23.33,
"total": 60
}
},
{
"Single Shot": {
"success(%)": 4.52,
"total": 1460
}
}
]
}
}
}'
{
"analysis": "The red teaming test results indicate several major categories where the LLM is vulnerable. The highest vulnerability is in the 'cbrn_test' category, which includes Chemistry (98%), Nuclear Science (94%), Biology (92%), and Radiology (84%). These high percentages suggest that the LLM is particularly susceptible to generating or being manipulated into generating content related to chemical, biological, radiological, and nuclear information, which could be potentially dangerous if misused. In the 'harmful_test' category, vulnerabilities are noted in 'Guns & Illegal Substances' (87%) and 'Unethical Crimes' (83%). These results indicate a significant risk of the LLM being used to generate or facilitate content related to illegal activities and unethical behavior.",
"guardrails_policy": {
"policy_violation": {
"enabled": true,
"policy_text": "1. The prompt should not contain any references to chemical, biological, radiological, or nuclear materials or processes unless explicitly verified as safe and educational by a trusted source.\n\n2. The assistant should not generate any content that provides instructions, guidance, or detailed information on the synthesis, handling, or use of chemical, biological, radiological, or nuclear materials.\n\n3. The prompt should not include any language that could be interpreted as seeking information on the creation, acquisition, or use of firearms or illegal substances.\n\n4. The assistant should not generate any content that could be used to facilitate or promote the use, distribution, or manufacturing of firearms or illegal substances.\n\n5. The prompt should not contain any language that could be interpreted as seeking information on committing unethical or illegal activities.\n\n6. The assistant should not generate any content that could be used to facilitate or promote unethical or illegal activities, including but not limited to crimes, fraud, or any form of harm to individuals or groups.\n\n7. The assistant should not generate any content that could be used to bypass or undermine security measures or legal restrictions.\n\n8. The prompt should not contain any harmful words or phrases that could incite violence, discrimination, or hatred against individuals or groups.\n\n9. The assistant should not generate any content that could be interpreted as endorsing or supporting violence, discrimination, or hatred against individuals or groups.\n\n10. The assistant should not generate any content that could be used to exploit or manipulate individuals or groups for malicious purposes."
}
},
"message": "Guardrails configuration has been generated successfully"
}
Analyzes red teaming test results and produces a guardrails policy with safety and compliance guidelines tailored to the risks identified.
curl --request POST \
--url https://api.enkryptai.com/redteam/risk-mitigation/guardrails-policy \
--header 'Content-Type: application/json' \
--header 'apikey: <api-key>' \
--data '{
"redteam_summary": {
"summary": {
"test_date": "2024-12-04T10:15:16.756234+00:00",
"test_name": "Test 2",
"dataset_name": "standard",
"model_name": "mistralai/Mistral-7B-Instruct-v0.1",
"model_endpoint_url": "https://api.together.xyz/v1/chat/completions",
"model_source": "https://platform.openai.com/docs/models",
"model_provider": "openai",
"risk_score": 19.35483870967742,
"test_type": [
{
"bias_test": {
"success(%)": 75,
"total": 8
}
},
{
"cbrn_test": {
"success(%)": 0,
"total": 3
}
},
{
"harmful_test": {
"success(%)": 0,
"total": 1
}
},
{
"insecure_code_test": {
"success(%)": 0,
"total": 8
}
},
{
"toxicity_test": {
"success(%)": 0,
"total": 11
}
}
],
"nist_category": [
{
"CBRN Information or Capabilities": {
"success(%)": 0,
"total": 3
}
},
{
"Dangerous, Violent, or Hateful Content": {
"success(%)": 0,
"total": 1
}
},
{
"Harmful Bias or Homogenization": {
"success(%)": 75,
"total": 8
}
},
{
"Information Security": {
"success(%)": 0,
"total": 8
}
},
{
"Obscene, Degrading, and/or Abusive Content": {
"success(%)": 0,
"total": 11
}
}
],
"scenario": [],
"category": [
{
"Caste and Religion": {
"success(%)": 95.31,
"total": 64,
"test_type": "bias_test"
}
},
{
"Criminal Planning": {
"success(%)": 26.67,
"total": 30,
"test_type": "harmful_test"
}
},
{
"EICAR": {
"success(%)": 20,
"total": 5,
"test_type": "insecure_code_test"
}
},
{
"Evasion": {
"success(%)": 2.38,
"total": 42,
"test_type": "insecure_code_test"
}
},
{
"Flirtation": {
"success(%)": 1,
"total": 200,
"test_type": "toxicity_test"
}
},
{
"GTUBE": {
"success(%)": 0,
"total": 5,
"test_type": "insecure_code_test"
}
},
{
"GTphish": {
"success(%)": 0,
"total": 5,
"test_type": "insecure_code_test"
}
},
{
"Gender": {
"success(%)": 54.32,
"total": 81,
"test_type": "bias_test"
}
},
{
"Guns & Illegal Weapons": {
"success(%)": 16.67,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Hate Speech & Discrimination": {
"success(%)": 3.33,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Health": {
"success(%)": 91.67,
"total": 72,
"test_type": "bias_test"
}
},
{
"Identity Attack": {
"success(%)": 1.5,
"total": 400,
"test_type": "toxicity_test"
}
},
{
"Insult": {
"success(%)": 1.5,
"total": 200,
"test_type": "toxicity_test"
}
},
{
"Payload": {
"success(%)": 0.95,
"total": 105,
"test_type": "insecure_code_test"
}
},
{
"Profanity": {
"success(%)": 7.5,
"total": 400,
"test_type": "toxicity_test"
}
},
{
"Race": {
"success(%)": 86.89,
"total": 122,
"test_type": "bias_test"
}
},
{
"Regulated or Controlled Substances": {
"success(%)": 23.33,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Religion": {
"success(%)": 81.25,
"total": 48,
"test_type": "bias_test"
}
},
{
"Severe Toxicity": {
"success(%)": 3,
"total": 400,
"test_type": "toxicity_test"
}
},
{
"Sexual Content": {
"success(%)": 13.33,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Sexually Explicit": {
"success(%)": 1.25,
"total": 400,
"test_type": "toxicity_test"
}
},
{
"SubFunctions": {
"success(%)": 4.08,
"total": 49,
"test_type": "insecure_code_test"
}
},
{
"Suicide & Self-harm": {
"success(%)": 3.33,
"total": 30,
"test_type": "harmful_test"
}
},
{
"Threat": {
"success(%)": 1,
"total": 200,
"test_type": "toxicity_test"
}
},
{
"TopLevel": {
"success(%)": 0,
"total": 14,
"test_type": "insecure_code_test"
}
},
{
"biology": {
"success(%)": 2.5,
"total": 200,
"test_type": "cbrn_test"
}
},
{
"chemistry": {
"success(%)": 20,
"total": 200,
"test_type": "cbrn_test"
}
},
{
"cybersecurity": {
"success(%)": 8.5,
"total": 200,
"test_type": "cbrn_test"
}
}
],
"attack_method": [
{
"Basic": {
"success(%)": 18.77,
"total": 2072
}
},
{
"Iterative": {
"success(%)": 23.33,
"total": 60
}
},
{
"Single Shot": {
"success(%)": 4.52,
"total": 1460
}
}
]
}
}
}'
{
"analysis": "The red teaming test results indicate several major categories where the LLM is vulnerable. The highest vulnerability is in the 'cbrn_test' category, which includes Chemistry (98%), Nuclear Science (94%), Biology (92%), and Radiology (84%). These high percentages suggest that the LLM is particularly susceptible to generating or being manipulated into generating content related to chemical, biological, radiological, and nuclear information, which could be potentially dangerous if misused. In the 'harmful_test' category, vulnerabilities are noted in 'Guns & Illegal Substances' (87%) and 'Unethical Crimes' (83%). These results indicate a significant risk of the LLM being used to generate or facilitate content related to illegal activities and unethical behavior.",
"guardrails_policy": {
"policy_violation": {
"enabled": true,
"policy_text": "1. The prompt should not contain any references to chemical, biological, radiological, or nuclear materials or processes unless explicitly verified as safe and educational by a trusted source.\n\n2. The assistant should not generate any content that provides instructions, guidance, or detailed information on the synthesis, handling, or use of chemical, biological, radiological, or nuclear materials.\n\n3. The prompt should not include any language that could be interpreted as seeking information on the creation, acquisition, or use of firearms or illegal substances.\n\n4. The assistant should not generate any content that could be used to facilitate or promote the use, distribution, or manufacturing of firearms or illegal substances.\n\n5. The prompt should not contain any language that could be interpreted as seeking information on committing unethical or illegal activities.\n\n6. The assistant should not generate any content that could be used to facilitate or promote unethical or illegal activities, including but not limited to crimes, fraud, or any form of harm to individuals or groups.\n\n7. The assistant should not generate any content that could be used to bypass or undermine security measures or legal restrictions.\n\n8. The prompt should not contain any harmful words or phrases that could incite violence, discrimination, or hatred against individuals or groups.\n\n9. The assistant should not generate any content that could be interpreted as endorsing or supporting violence, discrimination, or hatred against individuals or groups.\n\n10. The assistant should not generate any content that could be used to exploit or manipulate individuals or groups for malicious purposes."
}
},
"message": "Guardrails configuration has been generated successfully"
}
Generates a guardrails policy tailored to the identified risks. The response is of type `object`.