1
Fork 0
mirror of https://github.com/Steffo99/unimore-bda-4.git synced 2024-11-21 23:54:19 +00:00

First commit

This commit is contained in:
Steffo 2022-11-25 09:31:19 +01:00
commit c0d502f50d
Signed by: steffo
GPG key ID: 2A24051445686895
34 changed files with 1449 additions and 0 deletions

6
.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
/data/cratesio
/data/neo4j/data
/data/neo4j/logs
/data/neo4j/run
/data/neo4j/plugins
/data/neo4j/import

8
.idea/.gitignore vendored Normal file
View file

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View file

@ -0,0 +1,9 @@
<component name="libraryTable">
<library name="apoc-full">
<CLASSES>
<root url="jar://$PROJECT_DIR$/data/neo4j/plugins/apoc-full.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>

10
.idea/misc.xml Normal file
View file

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DiscordProjectSettings">
<option name="show" value="ASK" />
<option name="description" value="" />
</component>
<component name="ProjectRootManager">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

8
.idea/modules.xml Normal file
View file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/unimore-bda-4.iml" filepath="$PROJECT_DIR$/.idea/unimore-bda-4.iml" />
</modules>
</component>
</project>

9
.idea/unimore-bda-4.iml Normal file
View file

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml Normal file
View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

4
.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,4 @@
{
"version": "0.2.0",
"configurations": []
}

15
.vscode/tasks.json vendored Normal file
View file

@ -0,0 +1,15 @@
{
"version": "2.0.0",
"tasks": [
{
"label": "Neo4J",
"icon": {
"id": "database"
},
"type": "shell",
"command": "${workspaceFolder}/scripts/run-db.sh",
"problemMatcher": [],
"isBackground": true
}
]
}

1041
README.md Normal file

File diff suppressed because it is too large Load diff

1
data/neo4j Symbolic link
View file

@ -0,0 +1 @@
/home/steffo/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-de822d9c-8f2e-4f04-9646-ac8f2fb719c6/

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

BIN
media/cratesio-keywords.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Replaces the password of the "neo4j" user with the one used by the other
# scripts in this project.
#
# cypher-shell picks the credentials up from NEO4J_USERNAME / NEO4J_PASSWORD.

NEO4J_USERNAME="neo4j"
NEO4J_PASSWORD="neo4j"
export NEO4J_USERNAME NEO4J_PASSWORD

echo "Altering password..."

# The statement is run against the "system" database.
cypher-shell \
    --database="system" \
    --non-interactive \
    --fail-fast \
    'ALTER CURRENT USER SET PASSWORD FROM "neo4j" TO "unimore-big-data-analytics-4"'

View file

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Points <repo>/data/neo4j at the directory given as first argument, replacing
# any previous symbolic link.
#
# Usage:
#   ./create-neo4j-desktop-link.sh <dbms-directory>
#
# Example call:
# ./create-neo4j-desktop-link.sh "/home/steffo/.config/Neo4j Desktop/Application/relate-data/dbmss/dbms-13367bfc-b56d-418c-a9bd-c8c3932e1e0e"
set -euo pipefail

if [ "$#" -ne 1 ] || [ -z "$1" ]; then
    echo "Usage: $0 <dbms-directory>" >&2
    exit 1
fi

repo=$(git rev-parse --show-toplevel)
link="$repo/data/neo4j"

# If a real directory sits at the link path, `ln -s` would create the link
# *inside* it instead of replacing it; refuse rather than do that silently.
if [ -e "$link" ] && [ ! -L "$link" ]; then
    echo "Refusing to replace '$link': it exists and is not a symbolic link." >&2
    exit 1
fi

# `rm -f` succeeds whether or not a previous link exists (a bare `unlink`
# fails on the very first run).
rm -f -- "$link"
ln -s -- "$1" "$link"

15
scripts/fixup-data-files.sh Executable file
View file

@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Copies every CSV file found in <repo>/data/cratesio/*/data/ into
# <repo>/data/neo4j/import/, doubling each backslash on the way.
#
# All paths are absolute, so the working directory is never changed.
set -euo pipefail

repo=$(git rev-parse --show-toplevel)
import_dir="$repo/data/neo4j/import"

# Expand the glob straight into an array instead of parsing `ls` output, so
# paths containing whitespace survive; nullglob makes "no match" an empty array.
shopt -s nullglob
data_files=("$repo"/data/cratesio/*/data/*.csv)
shopt -u nullglob

if [ "${#data_files[@]}" -eq 0 ]; then
    echo "No CSV files found in $repo/data/cratesio/*/data/" >&2
    exit 1
fi

mkdir --parents "$import_dir"

for file in "${data_files[@]}"; do
    echo "Fixing data file $file..."
    basefilename=$(basename -- "$file")
    # Replace every single backslash with two backslashes.
    sed --expression='s=\\=\\\\=g' -- "$file" > "$import_dir/$basefilename"
done

18
scripts/import-cratesio.sh Executable file
View file

@ -0,0 +1,18 @@
#!/usr/bin/env bash
# Runs the Cypher scripts in <repo>/scripts/import-cratesio/ through
# cypher-shell, one after the other, in glob (lexicographic) order.
#
# Usage:
#   ./import-cratesio.sh [prefix]
#
# When a prefix is given, only scripts whose file name starts with it are run.
set -e

export NEO4J_USERNAME="neo4j"
export NEO4J_PASSWORD="unimore-big-data-analytics-4"

repo=$(git rev-parse --show-toplevel)

# Pathname expansion already yields a sorted list, one element per file, so no
# external `sort` is needed and paths with whitespace stay intact.
# The prefix is intentionally left unquoted so that it may itself be a glob
# pattern, as it could be before.
shopt -s nullglob
import_scripts=("$repo"/scripts/import-cratesio/${1:-}*.cypher)
shopt -u nullglob

if [ "${#import_scripts[@]}" -eq 0 ]; then
    echo "No import script matches '$repo/scripts/import-cratesio/${1:-}*.cypher'" >&2
    exit 1
fi

cd "$repo"
for file in "${import_scripts[@]}"; do
    echo "Executing $file..."
    cypher-shell --fail-at-end --format verbose < "$file"
done

View file

@ -0,0 +1,66 @@
// Loads crates.csv into (:Crate) nodes, keyed by their integer id.

// Property indexes on the fields of :Crate nodes.
CREATE RANGE INDEX index_crate_id IF NOT EXISTS
    FOR (c:Crate) ON (c.id);

CREATE RANGE INDEX index_crate_downloads IF NOT EXISTS
    FOR (c:Crate) ON (c.downloads);

CREATE RANGE INDEX index_crate_created_at IF NOT EXISTS
    FOR (c:Crate) ON (c.created_at);

CREATE RANGE INDEX index_crate_updated_at IF NOT EXISTS
    FOR (c:Crate) ON (c.updated_at);

CREATE TEXT INDEX index_crate_name IF NOT EXISTS
    FOR (c:Crate) ON (c.name);

// One MERGE per CSV row, committed in batches of 10000 rows.
// Timestamps are parsed to milliseconds; text columns that are empty after
// trimming are written as null.
LOAD CSV WITH HEADERS FROM "file:///crates.csv" AS row FIELDTERMINATOR ","
CALL {
    WITH row
    MERGE (c:Crate { id: toInteger(row.id) })
    SET
        c.name            = CASE trim(row.name)          WHEN "" THEN null ELSE row.name          END,
        c.description     = CASE trim(row.description)   WHEN "" THEN null ELSE row.description   END,
        c.readme          = CASE trim(row.readme)        WHEN "" THEN null ELSE row.readme        END,
        c.documentation   = CASE trim(row.documentation) WHEN "" THEN null ELSE row.documentation END,
        c.homepage        = CASE trim(row.homepage)      WHEN "" THEN null ELSE row.homepage      END,
        c.repository      = CASE trim(row.repository)    WHEN "" THEN null ELSE row.repository    END,
        c.downloads       = toInteger(row.downloads),
        c.max_upload_size = toInteger(row.max_upload_size),
        c.created_at      = apoc.date.parse(row.created_at, "ms", "yyyy-MM-dd HH:mm:ss"),
        c.updated_at      = apoc.date.parse(row.updated_at, "ms", "yyyy-MM-dd HH:mm:ss")
} IN TRANSACTIONS OF 10000 ROWS;

View file

@ -0,0 +1,13 @@
// Loads keywords.csv into (:Keyword) nodes, keyed by their integer id.

// Property indexes on the fields of :Keyword nodes.
CREATE RANGE INDEX index_keyword_id IF NOT EXISTS
    FOR (k:Keyword) ON (k.id);

CREATE TEXT INDEX index_keyword_name IF NOT EXISTS
    FOR (k:Keyword) ON (k.name);

// One MERGE per CSV row; the "keyword" column becomes the node's name and
// the creation timestamp is parsed to milliseconds.
LOAD CSV WITH HEADERS FROM "file:///keywords.csv" AS row FIELDTERMINATOR ","
MERGE (k:Keyword { id: toInteger(row.id) })
SET
    k.name       = row.keyword,
    k.created_at = apoc.date.parse(row.created_at, "ms", "yyyy-MM-dd HH:mm:ss");

View file

@ -0,0 +1,8 @@
// Rebuilds every (:Crate)-[:IS_TAGGED_WITH]->(:Keyword) relationship from
// crates_keywords.csv.

// Drop the existing relationships first, since the CREATE below would
// otherwise duplicate them on every run.
MATCH (:Crate)-[tag:IS_TAGGED_WITH]->(:Keyword)
DELETE tag;

// Rows whose crate or keyword id has no matching node produce nothing.
LOAD CSV WITH HEADERS FROM "file:///crates_keywords.csv" AS row FIELDTERMINATOR ","
MATCH (c:Crate { id: toInteger(row.crate_id) })
MATCH (k:Keyword { id: toInteger(row.keyword_id) })
CREATE (c)-[:IS_TAGGED_WITH]->(k);

View file

@ -0,0 +1,50 @@
// Rebuilds the (:Category) tree from categories.csv.
//
// Every :Category node is deleted and re-created, a synthetic "Root" node
// (id 0, path "root") is added, and (parent)-[:CONTAINS]->(child)
// relationships are derived from the dot-separated `path` property.

// Property indexes on the fields of :Category nodes.
CREATE RANGE INDEX index_category_id IF NOT EXISTS
    FOR (category:Category)
    ON (category.id);

CREATE TEXT INDEX index_category_name IF NOT EXISTS
    FOR (category:Category)
    ON (category.name);

CREATE TEXT INDEX index_category_slug IF NOT EXISTS
    FOR (category:Category)
    ON (category.slug);

CREATE TEXT INDEX index_category_leaf IF NOT EXISTS
    FOR (category:Category)
    ON (category.leaf);

// Start from scratch: the CREATE statements below are not idempotent.
MATCH (category:Category)
DETACH DELETE category;

// Synthetic root node.
// created_at uses timestamp() (milliseconds since the epoch) so that it has
// the same type as the values apoc.date.parse(..., "ms", ...) produces for
// every other category.
CREATE (
    :Category {
        name: "Root",
        created_at: timestamp(),
        description: "Root category. Does not contain any category by itself.",
        id: 0,
        path: "root",
        slug: "root"
    }
);

// One node per CSV row.
LOAD CSV WITH HEADERS FROM "file:///categories.csv" AS line
CREATE (
    :Category {
        name: line.category,
        created_at: apoc.date.parse(line.created_at, "ms", "yyyy-MM-dd HH:mm:ss"),
        description: line.description,
        id: toInteger(line.id),
        path: line.path,
        slug: line.slug
    }
);

// leaf = last segment of the dot-separated path.
MATCH (c:Category)
WITH c, split(c.path, ".") AS path
SET c.leaf = path[-1];

// Link each category to its parent.
// The candidate parent is looked up through its leaf (second-to-last segment
// of the child's path), then confirmed against the full path: two categories
// may share the same last segment, and matching on leaf alone would attach
// the child to both of them.
MATCH (c:Category)
WITH c, split(c.path, ".") AS path
MATCH (d:Category {leaf: path[-2]})
WHERE c.path = d.path + "." + c.leaf
CREATE (d)-[:CONTAINS]->(c);

View file

@ -0,0 +1,8 @@
// Rebuilds every (:Category)-[:CONTAINS]->(:Crate) relationship from
// crates_categories.csv.

// Drop the existing relationships first, since the CREATE below would
// otherwise duplicate them on every run. Only relationships pointing at a
// :Crate are matched here.
MATCH (:Category)-[membership:CONTAINS]->(:Crate)
DELETE membership;

// Rows whose crate or category id has no matching node produce nothing.
LOAD CSV WITH HEADERS FROM "file:///crates_categories.csv" AS row FIELDTERMINATOR ","
MATCH (cr:Crate { id: toInteger(row.crate_id) })
MATCH (cat:Category { id: toInteger(row.category_id) })
CREATE (cat)-[:CONTAINS]->(cr);

View file

@ -0,0 +1,23 @@
// Loads users.csv into (:User) nodes, keyed by their integer id.

// Property indexes on the fields of :User nodes.
CREATE RANGE INDEX index_user_id IF NOT EXISTS
    FOR (u:User) ON (u.id);

CREATE RANGE INDEX index_user_ghid IF NOT EXISTS
    FOR (u:User) ON (u.gh_id);

CREATE TEXT INDEX index_user_name IF NOT EXISTS
    FOR (u:User) ON (u.name);

CREATE TEXT INDEX index_user_fullname IF NOT EXISTS
    FOR (u:User) ON (u.full_name);

// One MERGE per CSV row. Note the column renames: gh_login -> name,
// name -> full_name, gh_avatar -> avatar.
LOAD CSV WITH HEADERS FROM "file:///users.csv" AS row FIELDTERMINATOR ","
MERGE (u:User { id: toInteger(row.id) })
SET
    u.name      = row.gh_login,
    u.full_name = row.name,
    u.gh_id     = toInteger(row.gh_id),
    u.avatar    = row.gh_avatar;

View file

@ -0,0 +1,11 @@
// Rebuilds every (:User)-[:OWNS]->(:Crate) relationship from crate_owners.csv.

// Drop the existing relationships first, since the CREATE below would
// otherwise duplicate them on every run.
MATCH (:User)-[previous:OWNS]->(:Crate)
DELETE previous;

// Rows whose crate or owner id has no matching node produce nothing.
// NOTE(review): owner_id is always resolved against :User nodes, whatever the
// value of owner_kind is; confirm that no owner_kind denotes a non-user owner.
LOAD CSV WITH HEADERS FROM "file:///crate_owners.csv" AS row FIELDTERMINATOR ","
MATCH (c:Crate { id: toInteger(row.crate_id) })
MATCH (u:User { id: toInteger(row.owner_id) })
CREATE (u)-[:OWNS {
    created_at: apoc.date.parse(row.created_at, "ms", "yyyy-MM-dd HH:mm:ss"),
    created_by: toInteger(row.created_by),
    owner_kind: toInteger(row.owner_kind)
}]->(c);

View file

@ -0,0 +1,50 @@
// Loads versions.csv into (:Version) nodes, keyed by their integer id, and
// links each version to its crate and to the user who published it:
//   (:Crate)-[:HAS_VERSION]->(:Version)
//   (:User)-[:PUBLISHED]->(:Version)

// Property indexes on the fields of :Version nodes.
// The checksum index is a RANGE index: a LOOKUP index can only be declared as
// `FOR (n) ON EACH labels(n)` and cannot target a single property, so the
// previous LOOKUP declaration was rejected and no index was created.
CREATE RANGE INDEX index_version_checksum IF NOT EXISTS
    FOR (version:Version)
    ON (version.checksum);

CREATE RANGE INDEX index_version_size IF NOT EXISTS
    FOR (version:Version)
    ON (version.size);

CREATE RANGE INDEX index_version_created_at IF NOT EXISTS
    FOR (version:Version)
    ON (version.created_at);

CREATE RANGE INDEX index_version_downloads IF NOT EXISTS
    FOR (version:Version)
    ON (version.downloads);

CREATE RANGE INDEX index_version_id IF NOT EXISTS
    FOR (version:Version)
    ON (version.id);

CREATE TEXT INDEX index_version_name IF NOT EXISTS
    FOR (version:Version)
    ON (version.name);

// One MERGE per CSV row, committed in batches of 10000 rows.
LOAD CSV WITH HEADERS FROM "file:///versions.csv" AS line FIELDTERMINATOR ","
CALL {
    WITH line
    MERGE (version:Version { id: toInteger(line.id) } )
    SET
        version.checksum = line.checksum,
        version.size = toInteger(line.crate_size),
        version.created_at = apoc.date.parse(line.created_at, "ms", "yyyy-MM-dd HH:mm:ss"),
        version.downloads = toInteger(line.downloads),
        version.license = line.license,
        version.features = line.features,
        version.links = line.links,
        // The "num" column (the version number) is stored as the node's name.
        version.name = line.num,
        // "t" means yanked; any other value is stored as false.
        version.is_yanked = CASE line.yanked
            WHEN "t"
                THEN true
            ELSE
                false
        END
    // The crate is linked before the publisher: a row whose crate_id matches
    // no :Crate node stops here and gets no :PUBLISHED relationship either.
    WITH line, version
    MATCH (crate:Crate { id: toInteger(line.crate_id) })
    MERGE (crate)-[:HAS_VERSION]->(version)
    // A row whose published_by matches no :User node simply gets no
    // :PUBLISHED relationship.
    WITH line, version
    MATCH (user:User { id: toInteger(line.published_by) })
    MERGE (user)-[:PUBLISHED]->(version)
} IN TRANSACTIONS OF 10000 ROWS;

View file

@ -0,0 +1,38 @@
// Loads dependencies.csv as (:Version)-[:DEPENDS_ON]->(:Crate) relationships.

// Property indexes on the fields of :DEPENDS_ON relationships.
CREATE RANGE INDEX index_dependency_id IF NOT EXISTS
    FOR ()-[dependency:DEPENDS_ON]->()
    ON (dependency.id);

CREATE TEXT INDEX index_dependency_requirement IF NOT EXISTS
    FOR ()-[dependency:DEPENDS_ON]->()
    ON (dependency.requirement);

CREATE TEXT INDEX index_dependency_explicit_name IF NOT EXISTS
    FOR ()-[dependency:DEPENDS_ON]->()
    ON (dependency.explicit_name);

// One MERGE per CSV row, committed in batches of 10000 rows.
// Rows whose version or crate id has no matching node produce nothing.
// NOTE(review): the MERGE pattern carries no properties, so several CSV rows
// for the same (version, crate) pair end up on a single relationship whose
// properties come from the last row processed; confirm this is intended.
LOAD CSV WITH HEADERS FROM "file:///dependencies.csv" AS line FIELDTERMINATOR ","
CALL {
    WITH line
    MATCH
        (version:Version { id: toInteger(line.version_id) }),
        (requirement:Crate { id: toInteger(line.crate_id) })
    MERGE (version)-[dependency:DEPENDS_ON]->(requirement)
    SET
        // Stored as an integer, like every other id in the graph.
        dependency.id = toInteger(line.id),
        // "t" means true; any other value is stored as false.
        dependency.is_optional = CASE line.optional
            WHEN "t"
                THEN true
            ELSE
                false
        END,
        dependency.is_default = CASE line.default_features
            WHEN "t"
                THEN true
            ELSE
                false
        END,
        dependency.explicit_name = line.explicit_name,
        dependency.features = line.features,
        dependency.requirement = line.req,
        dependency.target = line.target
} IN TRANSACTIONS OF 10000 ROWS;

4
scripts/run-db.sh Executable file
View file

@ -0,0 +1,4 @@
#!/usr/bin/env bash
# Runs `neo4j console` with NEO4J_HOME set to <repo>/data/neo4j.

repo_root="$(git rev-parse --show-toplevel)"

NEO4J_HOME="$repo_root/data/neo4j"
export NEO4J_HOME

neo4j console

9
scripts/setup-apoc.sh Executable file
View file

@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Downloads the APOC core 5.5.0 jar into <repo>/data/neo4j/plugins/apoc-core.jar.
set -euo pipefail

repo=$(git rev-parse --show-toplevel)
plugins_dir="$repo/data/neo4j/plugins"
target="$plugins_dir/apoc-core.jar"
partial="$target.part"

echo "Creating plugins directory..."
mkdir --parents "$plugins_dir"

echo "Installing Neo4j Apoc..."
# wget creates the output file even when the download fails, so download to a
# temporary name and move it into place only on success; a broken transfer can
# then never leave a truncated jar in the plugins directory.
trap 'rm -f -- "$partial"' EXIT
wget 'https://github.com/neo4j/apoc/releases/download/5.5.0/apoc-5.5.0-core.jar' --output-document="$partial"
mv -- "$partial" "$target"