Skip to content

Feat discipline repository type #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion src/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ async def gimie(full_path:str,
return {"link": full_path,
"output": gimie_output}

@app.get("/v1/llm/{full_path:path}")
@app.get("/v1/llm/json-ld/{full_path:path}")
async def llm(full_path:str):

try:
Expand All @@ -82,6 +82,20 @@ async def llm(full_path:str):
return {"link": full_path,
"output": llm_result}

# NOTE(review): this handler reuses the function name `llm`, which the
# JSON-LD handler earlier in this module already defines. The later
# definition rebinds the module-level name; consider a distinct name
# (e.g. `llm_json`) once callers and tooling have been checked.
@app.get("/v1/llm/json/{full_path:path}")
async def llm(full_path: str):
    """Return LLM-extracted repository metadata as plain JSON.

    Args:
        full_path: Repository location captured from the URL path; it is
            forwarded unchanged to ``llm_request_repo_infos``.

    Returns:
        A dict with the requested ``link`` and the extraction ``output``.

    Raises:
        HTTPException: With status 424 when the LLM extraction raises.
    """
    try:
        llm_result = llm_request_repo_infos(str(full_path), output_format="json")
    except Exception as e:
        # Chain the original exception so the root cause stays visible in
        # tracebacks and logs instead of being replaced by the HTTP error.
        raise HTTPException(
            status_code=424,
            detail=f"Error from LLM service: {e}",
        ) from e

    return {"link": full_path, "output": llm_result}

@app.exception_handler(ValueError)
async def value_error_exception_handler(request: Request, exc: ValueError):
return JSONResponse(
Expand Down
11 changes: 8 additions & 3 deletions src/core/genai_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def clone_repo(repo_url):
return None


def llm_request_repo_infos(repo_url):
def llm_request_repo_infos(repo_url, output_format="json-ld"):
# Clone the GitHub repository into a temporary folder
with tempfile.TemporaryDirectory() as temp_dir:
logger.info(f"Cloning {repo_url} into {temp_dir}...")
Expand Down Expand Up @@ -146,7 +146,6 @@ def llm_request_repo_infos(repo_url):
raw_result = response.json()["choices"][0]["message"]["content"]
parsed_result = clean_json_string(raw_result)
json_data = json.loads(parsed_result)
pprint(json_data)

logger.info("Successfully parsed API response")

Expand All @@ -161,7 +160,13 @@ def llm_request_repo_infos(repo_url):
# TODO. This is hardcoded. Not good.
context_path = "src/files/json-ld-context.json"
# Now convert cleaned data to JSON-LD
return json_to_jsonLD(cleaned_json, context_path)
if output_format == "json-ld":
return json_to_jsonLD(cleaned_json, context_path)
elif output_format == "json":
return cleaned_json
else:
logger.error(f"Unsupported output format: {output_format}")
return None

except Exception as e:
logger.error(f"Error parsing response: {e}")
Expand Down
61 changes: 61 additions & 0 deletions src/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,62 @@ class Image(BaseModel):
contentUrl: HttpUrl = None
keywords: ImageKeyword = ImageKeyword.ILLUSTRATIVE_IMAGE


# Closed vocabulary of academic disciplines; used as the element type of
# `SoftwareSourceCode.discipline`. The `str` mixin makes every member a
# string instance whose value is the human-readable label below.
class Discipline(str, Enum):
    # NOTE(review): the ordering looks like a broad branch followed by its
    # sub-fields (social, applied, humanities, formal, natural) — confirm
    # against the taxonomy this list was taken from before reordering.
    SOCIAL_SCIENCES = "Social sciences"
    ANTHROPOLOGY = "Anthropology"
    COMMUNICATION_STUDIES = "Communication studies"
    EDUCATION = "Education"
    LINGUISTICS = "Linguistics"
    RESEARCH = "Research"
    SOCIOLOGY = "Sociology"
    GEOGRAPHY = "Geography"
    PSYCHOLOGY = "Psychology"
    POLITICS = "Politics"
    ECONOMICS = "Economics"
    APPLIED_SCIENCES = "Applied sciences"
    HEALTH_SCIENCES = "Health sciences"
    ELECTRICAL_ENGINEERING = "Electrical engineering"
    CHEMICAL_ENGINEERING = "Chemical engineering"
    CIVIL_ENGINEERING = "Civil engineering"
    ARCHITECTURE = "Architecture"
    COMPUTER_ENGINEERING = "Computer engineering"
    ENERGY_ENGINEERING = "Energy engineering"
    MILITARY_SCIENCE = "Military science"
    INDUSTRIAL_PRODUCTION_ENGINEERING = "Industrial and production engineering"
    MECHANICAL_ENGINEERING = "Mechanical engineering"
    BIOLOGICAL_ENGINEERING = "Biological engineering"
    ENVIRONMENTAL_SCIENCE = "Environmental science"
    SYSTEMS_SCIENCE_ENGINEERING = "Systems science and engineering"
    INFORMATION_ENGINEERING = "Information engineering"
    AGRICULTURAL_FOOD_SCIENCES = "Agricultural and food sciences"
    BUSINESS = "Business"
    HUMANITIES = "Humanities"
    HISTORY = "History"
    LITERATURE = "Literature"
    ART = "Art"
    RELIGION = "Religion"
    PHILOSOPHY = "Philosophy"
    LAW = "Law"
    FORMAL_SCIENCES = "Formal sciences"
    MATHEMATICS = "Mathematics"
    LOGIC = "Logic"
    STATISTICS = "Statistics"
    THEORETICAL_COMPUTER_SCIENCE = "Theoretical computer science"
    NATURAL_SCIENCES = "Natural sciences"
    PHYSICS = "Physics"
    ASTRONOMY = "Astronomy"
    BIOLOGY = "Biology"
    CHEMISTRY = "Chemistry"
    EARTH_SCIENCE = "Earth science"

# Closed set of repository categories; used as the type of
# `SoftwareSourceCode.repositoryType`. Values are lowercase labels, and the
# `str` mixin makes each member a string instance.
class RepositoryType(str, Enum):
    SOFTWARE = "software"
    EDUCATIONAL_RESOURCE = "educational resource"
    DOCUMENTATION = "documentation"
    DATA = "data"
    # Catch-all for repositories that fit none of the categories above.
    OTHER = "other"

class SoftwareSourceCode(BaseModel):
name: Optional[str] = None
applicationCategory: Optional[List[str]] = None
Expand All @@ -74,6 +130,7 @@ class SoftwareSourceCode(BaseModel):
license: Annotated[str, StringConstraints(pattern=r"spdx\.org.*")] = None
author: List[Union[Person, Organization]] = None
relatedToOrganization: Optional[List[str]] = None
relatedToOrganizationJustification: Optional[List[str]] = None
operatingSystem: Optional[List[str]] = None
programmingLanguage: Optional[List[str]] = None
softwareRequirements: Optional[List[str]] = None
Expand All @@ -94,6 +151,10 @@ class SoftwareSourceCode(BaseModel):
imagingModality: Optional[List[str]] = None
fairLevel: Optional[str] = None
graph: Optional[str] = None
discipline: Optional[List[Discipline]] = None
disciplineJustification: Optional[List[str]] = None
repositoryType: Optional[RepositoryType] = None
respositoryTypeJustification: Optional[List[str]] = None


############################################################
Expand Down
17 changes: 5 additions & 12 deletions src/core/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
- `orcidId` (valid URL, **optional**)
- `affiliation` (list of strings, **optional**): Institutions the author is affiliated with. Do not mention Imaging Plaza unless it is explicitly mentioned.
- `relatedToOrganization` (list of strings, **optional**): Institutions associated with the software. Do not mention Imaging Plaza unless it is explicitly mentioned.
- `relatedToOrganizationJustification` (list of strings, **optional**): Justification for the related organizations.
- `softwareRequirements` (list of strings, **optional**): Dependencies or prerequisites for running the software.
- `operatingSystem` (list of strings, **optional**): Compatible operating systems. Use only Windows, Linux, MacOS, or Other.
- `programmingLanguage` (list of strings, **optional**): Programming languages used in the software.
Expand Down Expand Up @@ -92,18 +93,10 @@
- `hasExecutableInstructions` (string, **optional**): Any executable instructions related to the software. This should point to a URL where the installation is explained. If this is the README file, please provide the full URL.
- `readme` (valid URL, **optional**): README url of the software (at the root of the repo)
- `imagingModality` (list of strings, **optional**): imaging modalities accepted by the software.


When dealing with Organization pay attention to
-
-
-

When parsing Persons note:
-
-
-

- `discipline` (list of strings, **optional**): Scientific disciplines the software belongs to. Base your response on the README and other documentation files content.
- `disciplineJustification` (list of strings, **optional**): Justification for the discipline classification.
- `repositoryType` (string, **optional**): Type of repository (e.g., software, educational resource, documentation, data, other).
- `respositoryTypeJustification` (list of strings, **optional**): Justification for the repository type classification.

PLEASE PROVIDE THE OUTPUT IN JSON FORMAT ONLY, WITHOUT ANY EXPLANATION OR ADDITIONAL TEXT. ALIGN THE RESPONSE TO THE SCHEMA SPECIFICATION.
"""
4 changes: 3 additions & 1 deletion src/files/json-ld-context.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@
"hasRorId": "md4i:hasRorId",
"legalName": "schema:legalName",
"fundingGrant": "sd:fundingGrant",
"fundingSource": "sd:fundingSource"
"fundingSource": "sd:fundingSource",
"discipline": "pulse:discipline",
"repositoryType": "pulse:repositoryType"
}
}