Spaces:

GIZ
/

Development-Project-Synergy-Finder

Sleeping

App Files Files Community

Jan Mühlnikel committed on Apr 12

Commit

f3a1940

•

1 Parent(s): 39b49f4

test

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +36 -0
.streamlit/config.toml +14 -0
README.md +13 -0
__pycache__/app.cpython-310.pyc +0 -0
__pycache__/crs.cpython-310.pyc +0 -0
__pycache__/home.cpython-310.pyc +0 -0
__pycache__/sector.cpython-310.pyc +0 -0
__pycache__/similarity.cpython-310.pyc +0 -0
__pycache__/similarity_page.cpython-310.pyc +0 -0
functions/__pycache__/calc_matches.cpython-310.pyc +0 -0
functions/__pycache__/filter_projects.cpython-310.pyc +0 -0
functions/__pycache__/semantic_search.cpython-310.pyc +0 -0
functions/__pycache__/single_similar.cpython-310.pyc +0 -0
functions/calc_matches.py +37 -0
functions/different_orga_filter.py +12 -0
functions/filter_single.py +22 -0
functions/same_country_filter.py +16 -0
functions/semantic_search.py +27 -0
functions/single_similar.py +25 -0
modules/__pycache__/crs_table.cpython-310.pyc +0 -0
modules/__pycache__/filter_modules.cpython-310.pyc +0 -0
modules/__pycache__/filter_projects.cpython-310.pyc +0 -0
modules/__pycache__/navbar.cpython-310.pyc +0 -0
modules/__pycache__/result_table.cpython-310.pyc +0 -0
modules/__pycache__/sdg_table.cpython-310.pyc +0 -0
modules/__pycache__/semantic_search.cpython-310.pyc +0 -0
modules/__pycache__/similarity_table.cpython-310.pyc +0 -0
modules/multimatch_result_table.py +134 -0
requirements.txt +2 -1
similarity_page.py +1 -3
src/codelists/country_codes_ISO3166-1alpha-2.csv +3 -0
src/codelists/crs3_codes.csv +3 -0
src/codelists/crs5_codes.csv +3 -0
src/codelists/flags/AC.png +0 -0
src/codelists/flags/AD.png +0 -0
src/codelists/flags/AE.png +0 -0
src/codelists/flags/AF.png +0 -0
src/codelists/flags/AG.png +0 -0
src/codelists/flags/AI-alt.png +0 -0
src/codelists/flags/AI.png +0 -0
src/codelists/flags/AL.png +0 -0
src/codelists/flags/AM.png +0 -0
src/codelists/flags/AO.png +0 -0
src/codelists/flags/AQ.png +0 -0
src/codelists/flags/AR.png +0 -0
src/codelists/flags/AS.png +0 -0
src/codelists/flags/AT.png +0 -0
src/codelists/flags/AU.png +0 -0
src/codelists/flags/AW.png +0 -0
src/codelists/flags/AX.png +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,14 @@

+[global]
+[server]
+headless = true
+[client]
+initialSidebarState = "expanded"
+[theme]
+primaryColor="#c30f08"
+backgroundColor="#ffffff"
+secondaryBackgroundColor="#eceded"
+textColor="#000000"
+font="sans serif"

README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: Development Banks Collaboration Analyzer
+emoji: 🐢
+colorFrom: pink
+colorTo: red
+sdk: streamlit
+sdk_version: 1.32.2
+app_file: app.py
+pinned: true
+license: mit
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

__pycache__/app.cpython-310.pyc ADDED Viewed

Binary file (664 Bytes). View file

__pycache__/crs.cpython-310.pyc ADDED Viewed

Binary file (3.71 kB). View file

__pycache__/home.cpython-310.pyc ADDED Viewed

Binary file (447 Bytes). View file

__pycache__/sector.cpython-310.pyc ADDED Viewed

Binary file (6.06 kB). View file

__pycache__/similarity.cpython-310.pyc ADDED Viewed

Binary file (3.66 kB). View file

__pycache__/similarity_page.cpython-310.pyc ADDED Viewed

Binary file (9.48 kB). View file

functions/__pycache__/calc_matches.cpython-310.pyc ADDED Viewed

Binary file (922 Bytes). View file

functions/__pycache__/filter_projects.cpython-310.pyc ADDED Viewed

Binary file (1.81 kB). View file

functions/__pycache__/semantic_search.cpython-310.pyc ADDED Viewed

Binary file (1.07 kB). View file

functions/__pycache__/single_similar.cpython-310.pyc ADDED Viewed

Binary file (672 Bytes). View file

functions/calc_matches.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import pandas as pd
+import numpy as np
+def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
+    """Find the ``top_x`` most similar (project1, project2) pairs.
+
+    Rows of the match matrix are the projects in ``filtered_df`` (project1)
+    and columns are the projects in ``project_df`` (project2), so project2
+    can be any project.
+
+    Args:
+        filtered_df: DataFrame of project1 candidates. Its index values are
+            used directly as row positions into ``similarity_matrix``.
+        project_df: DataFrame of project2 candidates. Its index values are
+            used directly as column positions into ``similarity_matrix``.
+        similarity_matrix: 2-D array of pairwise similarities.
+            NOTE: its diagonal is overwritten with 0 in place.
+        top_x: Maximum number of pairs to return. Values below 1 yield empty
+            results; values above the number of available pairs are clamped.
+
+    Returns:
+        Tuple ``(p1_df, p2_df)`` of equally long DataFrames. Row ``k`` of each
+        holds one side of the k-th pair plus a ``similarity`` column. Pairs
+        are ordered by ascending similarity (best match last).
+    """
+    row_positions = filtered_df.index
+    col_positions = project_df.index
+    # A project must never be reported as its own best match. This mutates the
+    # caller's matrix on purpose-compatible terms: the original code did the
+    # same and other callers may depend on the zeroed diagonal.
+    np.fill_diagonal(similarity_matrix, 0)
+    match_matrix = similarity_matrix[row_positions, :][:, col_positions]
+    # Flattened positions sorted by ascending similarity.
+    ranked_flat = np.argsort(match_matrix, axis=None)
+    top_x = max(0, min(top_x, len(ranked_flat)))
+    # Slice from an explicit start: ``ranked_flat[-0:]`` would return every
+    # pair instead of none when top_x is 0.
+    best_flat = ranked_flat[len(ranked_flat) - top_x:]
+    # Row (project1) and column (project2) of each selected pair.
+    top_rows, top_cols = np.unravel_index(best_flat, match_matrix.shape)
+    top_values = match_matrix[top_rows, top_cols]
+    # ``assign`` returns new frames, so neither input is modified and pandas
+    # does not emit a SettingWithCopyWarning.
+    p1_df = filtered_df.iloc[top_rows].assign(similarity=top_values)
+    p2_df = project_df.iloc[top_cols].assign(similarity=top_values)
+    return p1_df, p2_df

functions/different_orga_filter.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import pandas as pd
+def different_orga_filter(df, orga):
+    """Keep only the projects that do not belong to ``orga``.
+
+    NOTE(review): the previous body was a copy of the country filter and
+    referenced the undefined name ``country_code_list``, so every call raised
+    NameError. The organization semantics below are inferred from the
+    function name and the unused ``orga`` parameter -- confirm with the
+    intended caller.
+
+    Args:
+        df: Project DataFrame with an ``orga_abbreviation`` column.
+        orga: One organization abbreviation, an iterable of abbreviations, or
+            None (nothing is excluded).
+
+    Returns:
+        The rows of ``df`` whose ``orga_abbreviation`` is not in ``orga``;
+        the original index labels are preserved.
+    """
+    if orga is None:
+        return df
+    # A bare string must be treated as one abbreviation, not as characters.
+    excluded = [orga] if isinstance(orga, str) else list(orga)
+    return df[~df["orga_abbreviation"].isin(excluded)]

functions/filter_single.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import pandas as pd
+from functions.semantic_search import search
+def contains_code(crs_codes, code_list):
+    """Return True if any ';'-separated entry of ``crs_codes`` is in ``code_list``.
+
+    ``crs_codes`` is converted with ``str()`` before splitting, so non-string
+    values are compared by their string form.
+    """
+    for candidate in str(crs_codes).split(";"):
+        if candidate in code_list:
+            return True
+    return False
+def filter_single(df, country_code_list, orga_code_list):
+    """Filter projects by country codes and organization abbreviations.
+
+    Args:
+        df: Project DataFrame with ``country`` and ``orga_abbreviation``
+            columns.
+        country_code_list: Country codes; a row is kept when its ``country``
+            string contains at least one of them. Empty means no filtering.
+        orga_code_list: Organization abbreviations to keep. Empty means no
+            filtering.
+
+    Returns:
+        The matching rows of ``df`` in their original order and with their
+        original index labels. Each row appears at most once.
+    """
+    # FILTER COUNTRY
+    if country_code_list:
+        # OR the per-code matches into one mask. Concatenating one frame per
+        # code duplicated rows that contain several selected codes, and left
+        # a column-less frame (KeyError in the organization filter) when
+        # nothing matched.
+        country_mask = pd.Series(False, index=df.index)
+        for code in country_code_list:
+            # Codes are literal text, not regular expressions.
+            country_mask |= df["country"].str.contains(code, na=False, regex=False)
+        df = df[country_mask]
+    # FILTER ORGANIZATION
+    if orga_code_list:
+        df = df[df["orga_abbreviation"].isin(orga_code_list)]
+    return df

functions/same_country_filter.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import pandas as pd
+from functions.semantic_search import search
+def same_country_filter(df, country_code_list):
+    """Keep the projects whose ``country`` contains one of the given codes.
+
+    Args:
+        df: Project DataFrame with a ``country`` column.
+        country_code_list: Country codes to keep. When empty, ``df`` is
+            returned unchanged.
+
+    Returns:
+        The matching rows of ``df`` in their original order and with their
+        original index labels. Each row appears at most once.
+    """
+    if not country_code_list:
+        return df
+    # One combined mask instead of concatenating a frame per code: rows that
+    # contain several selected codes are no longer duplicated, and the result
+    # keeps its columns even when nothing matches.
+    country_mask = pd.Series(False, index=df.index)
+    for code in country_code_list:
+        # Codes are literal text, not regular expressions.
+        country_mask |= df["country"].str.contains(code, na=False, regex=False)
+    return df[country_mask]

functions/semantic_search.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import pickle
+import faiss
+import streamlit as st
+from sentence_transformers import SentenceTransformer
+import pandas as pd
+def search(query, model, embeddings, filtered_df, top_x=20):
+    """Return the rows of ``filtered_df`` closest to ``query`` in embedding space.
+
+    Args:
+        query: Search text.
+        model: Object whose ``encode`` method maps a list of texts to
+            embedding vectors.
+        embeddings: Array of project embeddings; the index values of
+            ``filtered_df`` are used directly as row positions into it.
+        filtered_df: DataFrame of candidate projects.
+        top_x: Maximum number of rows to return.
+
+    Returns:
+        Up to ``top_x`` rows of ``filtered_df``, in the order returned by the
+        FAISS search. Empty when there are no candidates or ``top_x < 1``.
+    """
+    # Never ask FAISS for more neighbours than there are vectors: it pads the
+    # result with -1, which ``iloc`` would read as "the last row".
+    k = min(top_x, len(filtered_df))
+    if k <= 0:
+        return filtered_df.iloc[0:0]
+    row_positions = filtered_df.index
+    filtered_embeddings = embeddings[row_positions]
+    # Build a flat L2 index over the candidate embeddings only.
+    faiss_index = faiss.IndexFlatL2(filtered_embeddings.shape[1])
+    faiss_index.add(filtered_embeddings)
+    # Convert query to a (1, dimension) embedding.
+    query_embedding = model.encode([query])[0].reshape(1, -1)
+    _distances, neighbours = faiss_index.search(query_embedding, k=k)
+    # Positions refer to rows of ``filtered_df``; drop any -1 padding.
+    hits = [int(position) for position in neighbours[0] if position >= 0]
+    return filtered_df.iloc[hits]

functions/single_similar.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import pandas as pd
+import numpy as np
+def find_similar(p_index, similarity_matrix, filtered_df, top_x):
+    """Return the ``top_x`` projects in ``filtered_df`` most similar to one project.
+
+    Args:
+        p_index: Row position of the selected project in
+            ``similarity_matrix``.
+        similarity_matrix: 2-D array of pairwise similarities.
+        filtered_df: DataFrame of candidate projects; its index values are
+            used directly as column positions into ``similarity_matrix``.
+        top_x: Maximum number of rows to return (previously ignored in favour
+            of a hard-coded 10). Values below 1 yield an empty result.
+
+    Returns:
+        A new DataFrame with up to ``top_x`` rows of ``filtered_df`` ordered
+        by descending similarity, plus a ``similarity`` column.
+    """
+    candidate_positions = filtered_df.index.to_numpy()
+    # Read only the selected project's row for the candidate columns instead
+    # of first copying every row of the column-filtered matrix.
+    project_row = similarity_matrix[p_index, candidate_positions]
+    top_x = max(0, min(top_x, len(project_row)))
+    ranked = np.argsort(project_row)
+    # Explicit start so that top_x == 0 selects nothing (``[-0:]`` would
+    # select everything); reverse to get the best match first.
+    best_first = ranked[len(ranked) - top_x:][::-1]
+    # ``assign`` returns a new frame, leaving ``filtered_df`` untouched.
+    return filtered_df.iloc[best_first].assign(similarity=project_row[best_first])

modules/__pycache__/crs_table.cpython-310.pyc ADDED Viewed

Binary file (1.21 kB). View file

modules/__pycache__/filter_modules.cpython-310.pyc ADDED Viewed

Binary file (997 Bytes). View file

modules/__pycache__/filter_projects.cpython-310.pyc ADDED Viewed

Binary file (979 Bytes). View file

modules/__pycache__/navbar.cpython-310.pyc ADDED Viewed

Binary file (784 Bytes). View file

modules/__pycache__/result_table.cpython-310.pyc ADDED Viewed

Binary file (2.65 kB). View file

modules/__pycache__/sdg_table.cpython-310.pyc ADDED Viewed

Binary file (1.19 kB). View file

modules/__pycache__/semantic_search.cpython-310.pyc ADDED Viewed

Binary file (1.17 kB). View file

modules/__pycache__/similarity_table.cpython-310.pyc ADDED Viewed

Binary file (1.41 kB). View file

modules/multimatch_result_table.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import streamlit as st
+import pandas as pd
+def show_multi_table(p1_df, p2_df):
+    """Render matched project pairs as Streamlit tables, one table per match.
+
+    Args:
+        p1_df: DataFrame holding the first project of every match, one row
+            per match, including a ``similarity`` column.
+        p2_df: DataFrame holding the second project of every match,
+            row-aligned with ``p1_df``.
+
+    Each rendered match shows a running match number, the similarity as a
+    percentage and a two-row table (project 1 above project 2). Rows are
+    visited from last to first.
+    """
+    st.write("------------------")
+    p1_df = p1_df.reset_index(drop=True)
+    p2_df = p2_df.reset_index(drop=True)
+    # Display settings do not depend on the match, so build them once.
+    shown_columns = [
+        "iati_id",
+        "title_main",
+        "orga_abbreviation",
+        "client",
+        "description_main",
+        "country_name",
+        "flag",
+        "sdg_list",
+        "crs_3_code_list",
+        "crs_5_code_list",
+    ]
+    column_config = {
+        "iati_id": st.column_config.TextColumn(
+            "IATI ID",
+            help="IATI Project ID",
+            disabled=True,
+            width="small",
+        ),
+        "orga_abbreviation": st.column_config.TextColumn(
+            "Organization",
+            # Previously carried the help text copied from "Description".
+            help="Organization abbreviation",
+            disabled=True,
+            width="small",
+        ),
+        "client": st.column_config.TextColumn(
+            "Client",
+            help="Client organization of customer",
+            disabled=True,
+            width="small",
+        ),
+        "title_main": st.column_config.TextColumn(
+            "Title",
+            help="If title not in English, title in other language provided",
+            disabled=True,
+            width="large",
+        ),
+        "description_main": st.column_config.TextColumn(
+            "Description",
+            help="If description not in English, description in other language provided",
+            disabled=True,
+            width="large",
+        ),
+        "country_name": st.column_config.TextColumn(
+            "Country",
+            help="Country of project",
+            disabled=True,
+            width="small",
+        ),
+        "flag": st.column_config.ImageColumn(
+            "Flag",
+            help="country flag",
+            width="small",
+        ),
+        "sdg_list": st.column_config.ListColumn(
+            "SDG Prediction",
+            help="Prediction of SDG's",
+            width="small",
+        ),
+        "crs_3_code_list": st.column_config.ListColumn(
+            "CRS 3",
+            help="CRS 3 code given by organization",
+            width="medium",
+        ),
+        "crs_5_code_list": st.column_config.ListColumn(
+            "CRS 5",
+            help="CRS 5 code given by organization",
+            width="medium",
+        ),
+    }
+    # Walk backwards in steps of two. Per the original author's note, matches
+    # occur in both directions (A-B and B-A), so every second row is skipped
+    # to show each pair only once.
+    # NOTE(review): this presumes the mirrored rows are adjacent -- confirm
+    # against the code that builds p1_df / p2_df.
+    for match_number, i in enumerate(range(len(p1_df) - 1, -1, -2), start=1):
+        # Copy the single-row slices so the column assignments below write to
+        # independent frames instead of triggering SettingWithCopyWarning.
+        row_from_p1 = p1_df.iloc[[i]].copy()
+        row_from_p2 = p2_df.iloc[[i]].copy()
+        # TODO: integrate this string-to-list conversion into preprocessing.
+        # "a;b;" -> ["a", "b"]; whatever follows the last ";" is dropped.
+        try:
+            row_from_p1["crs_3_code_list"] = [row_from_p1["crs_3_name"].item().split(";")[:-1]]
+            row_from_p2["crs_3_code_list"] = [row_from_p2["crs_3_name"].item().split(";")[:-1]]
+        except Exception:
+            # Best effort (e.g. a non-string value): blank both rows.
+            row_from_p1["crs_3_code_list"] = [""]
+            row_from_p2["crs_3_code_list"] = [""]
+        try:
+            row_from_p1["crs_5_code_list"] = [row_from_p1["crs_5_name"].item().split(";")[:-1]]
+            row_from_p2["crs_5_code_list"] = [row_from_p2["crs_5_name"].item().split(";")[:-1]]
+        except Exception:
+            row_from_p1["crs_5_code_list"] = [""]
+            row_from_p2["crs_5_code_list"] = [""]
+        row_from_p1["sdg_list"] = [row_from_p1["sgd_pred_code"].item()]
+        row_from_p2["sdg_list"] = [row_from_p2["sgd_pred_code"].item()]
+        # The flag image is looked up by the first two characters of the
+        # country value; "xx" is the placeholder when that value is unusable.
+        try:
+            row_from_p1["flag"] = f"https://flagicons.lipis.dev/flags/4x3/{row_from_p1['country'].item()[:2].lower()}.svg"
+            row_from_p2["flag"] = f"https://flagicons.lipis.dev/flags/4x3/{row_from_p2['country'].item()[:2].lower()}.svg"
+        except Exception:
+            row_from_p1["flag"] = "https://flagicons.lipis.dev/flags/4x3/xx.svg"
+            row_from_p2["flag"] = "https://flagicons.lipis.dev/flags/4x3/xx.svg"
+        match_df = pd.concat([row_from_p1, row_from_p2], ignore_index=True)
+        similarity = row_from_p1["similarity"].item()
+        col1, col2 = st.columns([1, 12])
+        with col1:
+            # Remove the arrow from the standard st.metric() delta.
+            st.write(
+                """
+                <style>
+                [data-testid="stMetricDelta"] svg {
+                    display: none;
+                }
+                </style>
+                """,
+                unsafe_allow_html=True,
+            )
+            # Format as a rounded percentage; the previous string slicing
+            # truncated digits and rendered 1.0 as "100.".
+            st.metric(label="Match", value=f"{match_number}", delta=f"~ {similarity * 100:.1f} %")
+        with col2:
+            st.write("    ")
+            st.dataframe(
+                match_df[shown_columns],
+                use_container_width=True,
+                # One header line plus one line per project row.
+                height=35 + 35 * len(match_df),
+                column_config=column_config,
+                hide_index=True,
+            )
+        st.write("------------------")

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ scipy==1.12.0
 faiss-cpu==1.8.0
 faiss-gpu==1.7.2
 sentence-transformers==2.5.1
-streamlit-aggrid==0.3.4.

 faiss-cpu==1.8.0
 faiss-gpu==1.7.2
 sentence-transformers==2.5.1
+streamlit-aggrid==0.3.4
+psutil==5.9.0

similarity_page.py CHANGED Viewed

@@ -17,15 +17,13 @@ from functions.filter_single import filter_single
 from functions.calc_matches import calc_matches
 from functions.same_country_filter import same_country_filter
 from functions.single_similar import find_similar
-#import psutil
 import os
 import gc
-"""
 def get_process_memory():
     process = psutil.Process(os.getpid())
     return process.memory_info().rss / (1024 * 1024)
-"""
 # Catch DATA
 # Load Similarity matrix

 from functions.calc_matches import calc_matches
 from functions.same_country_filter import same_country_filter
 from functions.single_similar import find_similar
+import psutil
 import os
 import gc
 def get_process_memory():
     process = psutil.Process(os.getpid())
     return process.memory_info().rss / (1024 * 1024)
 # Catch DATA
 # Load Similarity matrix