// blob: aa9bf37384e3be175d508d8e1e50b2affd3a9d45 [file] [log] [blame]
use crate::dedupe::DedupeContext;
use crate::{Crate, CratesIterError, Error, IndexConfig};
use git2::Repository;
use std::fmt;
use std::{
io,
path::{Path, PathBuf},
};
/// Wrapper around managing the crates.io-index git repository
///
/// Uses a "bare" git index that fetches files directly from the repo instead of local checkout.
/// Uses Cargo's cache.
pub struct Index {
/// Directory of the index on disk, as returned by [`Index::path`].
path: PathBuf,
/// URL of the index, as returned by [`Index::url`]. It is also the fetch
/// source when the repository has no `origin` remote.
url: String,
/// Handle to the git repository the index data is read from.
repo: git2::Repository,
/// Commit id resolved from `FETCH_HEAD` (or `HEAD` as a fallback); tree
/// lookups start from this commit.
head: git2::Oid,
/// `head` rendered as a string, handed to `Crate::from_cache_slice` when
/// reading `.cache` entries.
head_str: String,
}
impl Index {
    /// Opens the crates.io index stored at `path`, initializing and fetching
    /// it first when necessary.
    ///
    /// # Panics
    ///
    /// Panics if the index cannot be opened or initialized.
    #[doc(hidden)]
    #[deprecated(note = "use new_cargo_default()")]
    pub fn new<P: Into<PathBuf>>(path: P) -> Self {
        let index_dir: PathBuf = path.into();
        let index_url = crate::INDEX_GIT_URL.to_owned();
        Self::from_path_and_url(index_dir, index_url).unwrap()
    }

    /// Opens the index of the default crates.io registry at the on-disk
    /// location Cargo itself uses.
    ///
    /// Prefer this constructor when you simply want Cargo's own index.
    #[inline]
    pub fn new_cargo_default() -> Result<Self, Error> {
        let default_url = crate::INDEX_GIT_URL;
        Self::from_url(default_url)
    }

    /// Opens a bare index for the registry at `url`, stored in the directory
    /// Cargo would use for that registry (`<cargo home>/registry/index/<name>`).
    ///
    /// Useful for custom registries.
    ///
    /// # Errors
    ///
    /// Fails when `url` cannot be mapped to a directory name or when the
    /// repository cannot be opened or initialized.
    pub fn from_url(url: &str) -> Result<Self, Error> {
        let (dir_name, canonical_url) = url_to_local_dir(url)?;
        // An undeterminable Cargo home falls back to an empty (relative) base path.
        let index_dir = home::cargo_home()
            .unwrap_or_default()
            .join("registry/index")
            .join(dir_name);
        Self::from_path_and_url(index_dir, canonical_url)
    }

    /// Opens a bare index located at `path` whose upstream repository is `url`.
    #[inline]
    pub fn with_path<P: Into<PathBuf>, S: Into<String>>(path: P, url: S) -> Result<Self, Error> {
        let index_dir: PathBuf = path.into();
        let index_url: String = url.into();
        Self::from_path_and_url(index_dir, index_url)
    }

    /// Directory that holds the index.
    #[inline]
    pub fn path(&self) -> &Path {
        self.path.as_path()
    }

    /// URL of the index.
    #[inline]
    #[must_use]
    pub fn url(&self) -> &str {
        self.url.as_str()
    }
}
impl Index {
/// Opens the repository at `path`, or initializes it and fetches `url` when
/// no matching repository is found, then resolves the commit to read from.
///
/// # Errors
///
/// Propagates git errors from initializing, fetching, opening the
/// repository, or resolving `FETCH_HEAD`/`HEAD`.
fn from_path_and_url(path: PathBuf, url: String) -> Result<Self, Error> {
    // A repository counts as usable when it is discoverable from `path` and its
    // `origin` (if it has one with a readable URL) points at `url`.
    let already_present = match git2::Repository::discover(&path) {
        Err(_) => false,
        Ok(found) => {
            let origin = found.find_remote("origin").ok();
            // Cargo creates a checkout without an origin set,
            // so default to true in case of missing origin
            let same_origin = origin
                .as_ref()
                .and_then(|remote| remote.url())
                .map_or(true, |origin_url| origin_url == url);
            same_origin
        }
    };

    let repo = if already_present {
        git2::Repository::open(&path)?
    } else {
        let mut init_opts = git2::RepositoryInitOptions::new();
        init_opts.external_template(false);
        let fresh = git2::Repository::init_opts(&path, &init_opts)?;
        {
            // Prefer a configured `origin`; otherwise fetch straight from `url`.
            let mut origin_remote = match fresh.find_remote("origin") {
                Ok(remote) => remote,
                Err(_) => fresh.remote_anonymous(&url)?,
            };
            let refspecs = [
                "HEAD:refs/remotes/origin/HEAD",
                "master:refs/remotes/origin/master",
            ];
            let mut fetch_options = crate::fetch_opts();
            origin_remote.fetch(&refspecs, Some(&mut fetch_options), None)?;
        }
        fresh
    };

    // Fallback to HEAD, as a fresh clone won't have a FETCH_HEAD
    let head = match repo.refname_to_id("FETCH_HEAD") {
        Ok(oid) => oid,
        Err(_) => repo.refname_to_id("HEAD")?,
    };
    let head_str = head.to_string();

    Ok(Self {
        path,
        url,
        repo,
        head,
        head_str,
    })
}
/// Looks up the commit stored in `self.head` and returns its root tree.
fn tree(&self) -> Result<git2::Tree<'_>, git2::Error> {
    self.repo
        .find_commit(self.head)
        .and_then(|commit| commit.tree())
}
/// Deprecated alias that simply forwards to [`Index::update`].
#[doc(hidden)]
#[deprecated(note = "use update()")]
pub fn retrieve_or_update(&mut self) -> Result<(), Error> {
self.update()
}
/// Deprecated no-op kept for API compatibility; always returns `Ok(())`.
#[doc(hidden)]
#[deprecated(note = "it's always retrieved. there's no need to call it any more")]
pub fn retrieve(&self) -> Result<(), Error> {
Ok(())
}
/// Deprecated check kept for API compatibility; always returns `true`.
#[doc(hidden)]
#[deprecated(note = "it's always retrieved, so it's assumed to always exist")]
pub fn exists(&self) -> bool {
true
}
/// Fetches the latest state of the remote index and moves this index to the
/// fetched commit.
///
/// Cache entries are validated against the current commit, so once a new
/// commit has been fetched the existing cache entries no longer match and
/// are not used.
///
/// # Errors
///
/// Propagates git errors from resolving the remote, fetching, or resolving
/// `FETCH_HEAD`/`HEAD`.
pub fn update(&mut self) -> Result<(), Error> {
    {
        // Prefer a configured `origin`; otherwise fetch straight from the index URL.
        let mut origin_remote = match self.repo.find_remote("origin") {
            Ok(remote) => remote,
            Err(_) => self.repo.remote_anonymous(&self.url)?,
        };
        let refspecs = [
            "HEAD:refs/remotes/origin/HEAD",
            "master:refs/remotes/origin/master",
        ];
        let mut fetch_options = crate::fetch_opts();
        origin_remote.fetch(&refspecs, Some(&mut fetch_options), None)?;
    }

    // Same resolution order as on construction: FETCH_HEAD first, HEAD otherwise.
    let new_head = match self.repo.refname_to_id("FETCH_HEAD") {
        Ok(oid) => oid,
        Err(_) => self.repo.refname_to_id("HEAD")?,
    };
    self.head_str = new_head.to_string();
    self.head = new_head;
    Ok(())
}
/// Looks up a single crate by name.
///
/// A `.cache` entry under the index directory is tried first; when it is
/// missing or does not parse for the current commit, the crate is read
/// from the git blob instead. Returns `None` when the name cannot be mapped
/// to an index path or the crate cannot be read either way.
///
/// Intended for fetching a handful of crates. To read most of the index,
/// prefer the [`Index::crates()`] iterator.
pub fn crate_(&self, name: &str) -> Option<Crate> {
    let rel_path = crate::crate_name_to_relative_path(name)?;

    // Reserve room for the whole path up front: index dir + ".cache" (6 bytes)
    // + two separators + relative path, so the pushes below need not reallocate.
    let capacity = path_max_byte_len(&self.path) + 8 + rel_path.len();
    let mut cache_path = PathBuf::with_capacity(capacity);
    cache_path.push(&self.path);
    cache_path.push(".cache");
    cache_path.push(&rel_path);

    // The cache is purely an accelerator; any failure here is non-fatal.
    let cached = std::fs::read(&cache_path)
        .ok()
        .and_then(|cache_bytes| Crate::from_cache_slice(&cache_bytes, &self.head_str).ok());

    // Without a valid cache entry, read the blob through git.
    cached.or_else(|| self.crate_from_rel_path(&rel_path).ok())
}
fn crate_from_rel_path(&self, path: &str) -> Result<Crate, Error> {
let entry = self.tree()?.get_path(Path::new(path))?;
let object = entry.to_object(&self.repo)?;
let blob = object
.as_blob()
.ok_or_else(|| Error::Io(io::Error::new(io::ErrorKind::NotFound, path.to_owned())))?;
Crate::from_slice(blob.content()).map_err(Error::Io)
}
/// Returns a single-threaded iterator over every crate in the index.
///
/// [`Index::crates_parallel`] is typically 3 times faster.
///
/// Crates that fail to parse are skipped (none are expected in the crates-io index).
///
/// # Panics
///
/// Panics if the tree of the current commit cannot be read.
#[inline]
pub fn crates(&self) -> Crates<'_> {
    let blobs = self.crates_refs().expect("HEAD commit disappeared");
    let dedupe = MaybeOwned::Owned(DedupeContext::new());
    Crates { blobs, dedupe }
}
/// Iterate over all crates using rayon.
///
/// Each item is `Ok(Crate)` for a crate that parsed. An `Err(CratesIterError)`
/// item is produced when the top-level directory listing fails, when a worker
/// cannot open the repository, or when a top-level directory object cannot be
/// found. Crates that fail to parse are skipped, as in [`Index::crates`].
///
/// This method is available only if the "parallel" feature is enabled.
#[cfg(feature = "parallel")]
pub fn crates_parallel(&self) -> impl rayon::iter::ParallelIterator<Item=Result<Crate, CratesIterError>> + '_ {
use rayon::iter::{IntoParallelIterator, ParallelIterator, IndexedParallelIterator};
// Only object ids are collected here; the objects themselves are looked up
// again inside the workers through the workers' own repository handles.
let tree_oids = match self.crates_top_level_refs() {
Ok(objs) => objs.into_iter().map(|obj| obj.id()).collect::<Vec<_>>(),
Err(_) => vec![git2::Oid::zero()], // intentionally broken oid to return error from the iterator
};
let path = self.repo.path();
tree_oids.into_par_iter()
.with_min_len(64)
.map_init(
// Worker state: a separate repository handle opened from the same git
// directory, plus a dedupe context reused across the items it handles.
// NOTE(review): presumably needed because `self.repo` cannot be shared
// between rayon threads — confirm.
move || (Repository::open_bare(&path), DedupeContext::new()),
|(repo, ctx), oid| {
// Opening the repository may have failed in the initializer; report
// that as a single error item for this directory.
let repo = match repo.as_ref() {
Ok(repo) => repo,
Err(_) => return vec![Err(CratesIterError)],
};
let mut stack = Vec::with_capacity(64);
match repo.find_object(oid, None) {
Ok(obj) => stack.push(obj),
Err(_) => return vec![Err(CratesIterError)],
};
// Walk this top-level directory sequentially and collect every crate
// that parses.
let blobs = CratesRefs { stack, repo };
Crates {
blobs,
dedupe: MaybeOwned::Borrowed(ctx),
}
.map(Ok)
.collect::<Vec<_>>()
},
)
// Flatten the per-directory vectors into one stream of results.
.flat_map_iter(|chunk| chunk.into_iter())
}
/// Returns an iterator over all the crates in the index.
///
/// It yields an opaque reference for each crate, which can be parsed with
/// [`CrateRef::parse`].
pub(crate) fn crates_refs(&self) -> Result<CratesRefs<'_>, git2::Error> {
    self.crates_top_level_refs().map(|stack| CratesRefs {
        stack,
        repo: &self.repo,
    })
}
/// Collects the top-level directories of the index tree that hold crates:
/// entries whose name is at most two bytes long and which resolve to a tree.
pub(crate) fn crates_top_level_refs(&self) -> Result<Vec<git2::Object<'_>>, git2::Error> {
    let root = self.tree()?;
    let mut dirs = Vec::with_capacity(800);
    for entry in root.iter() {
        // crates are in short dirs, skip .git/.cache
        if entry.name_bytes().len() > 2 {
            continue;
        }
        let object = entry.to_object(&self.repo)?;
        // Scan only directories at top level
        if object.as_tree().is_none() {
            continue;
        }
        dirs.push(object);
    }
    Ok(dirs)
}
/// Get the global configuration of the index.
pub fn index_config(&self) -> Result<IndexConfig, Error> {
let entry = self.tree()?.get_path(Path::new("config.json"))?;
let object = entry.to_object(&self.repo)?;
let blob = object
.as_blob()
.ok_or_else(|| Error::Io(io::Error::new(io::ErrorKind::NotFound, "config.json")))?;
serde_json::from_slice(blob.content()).map_err(Error::Json)
}
}
/// Returns the length, in bytes, of the storage backing `path`.
///
/// Used only as a capacity hint when pre-allocating a `PathBuf`.
/// `OsStr::len` reports the underlying storage length on every platform, so
/// no platform-specific variants are needed and non-UTF-8 paths still yield a
/// useful hint instead of `0`.
fn path_max_byte_len(path: &Path) -> usize {
    path.as_os_str().len()
}
/// Iterator over all crates in the index, but returns opaque objects that can be parsed separately.
///
/// See [`CrateRef::parse`].
pub(crate) struct CratesRefs<'a> {
/// Git objects still to be visited. Trees popped from here are expanded
/// into their entries; any other object is yielded as a [`CrateRef`].
stack: Vec<git2::Object<'a>>,
/// Repository used to resolve tree entries into objects.
repo: &'a git2::Repository,
}
/// Opaque representation of a crate in the index. See [`CrateRef::parse`].
///
/// Wraps the git object found for the crate; the raw bytes are available via
/// [`CrateRef::as_slice`] when that object is a blob.
pub(crate) struct CrateRef<'a>(git2::Object<'a>);
impl CrateRef<'_> {
    /// Parses the crate behind this reference, as yielded by the
    /// [`Index::crates_refs`] iterator.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidData` error when the underlying object is not a
    /// blob, or the parser's error when the content is malformed.
    #[inline]
    pub fn parse(&self, ctx: &mut DedupeContext) -> io::Result<Crate> {
        match self.as_slice() {
            Some(bytes) => Crate::from_slice_with_context(bytes, ctx),
            None => Err(io::Error::from(io::ErrorKind::InvalidData)),
        }
    }

    /// Raw crate data that can be parsed with [`Crate::from_slice`], or `None`
    /// when the underlying object is not a blob.
    pub fn as_slice(&self) -> Option<&[u8]> {
        self.0.as_blob().map(|blob| blob.content())
    }
}
impl<'a> Iterator for CratesRefs<'a> {
    type Item = CrateRef<'a>;

    /// Pops pending objects until a non-tree object is found and yields it;
    /// trees are replaced on the stack by their entries.
    ///
    /// # Panics
    ///
    /// Panics if a tree entry cannot be resolved to an object.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let current = self.stack.pop()?;
            if let Some(subtree) = current.as_tree() {
                let repo = self.repo;
                // Entries are pushed in reverse so they are popped in tree order.
                self.stack.extend(
                    subtree
                        .iter()
                        .rev()
                        .map(|entry| entry.to_object(repo).unwrap()),
                );
            } else {
                return Some(CrateRef(current));
            }
        }
    }
}
impl fmt::Debug for CrateRef<'_> {
    /// Formats the reference as `CrateRef { oid: .. }` using the id of the
    /// wrapped git object.
    #[cold]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let oid = self.0.id();
        let mut builder = f.debug_struct("CrateRef");
        builder.field("oid", &oid);
        builder.finish()
    }
}
/// A value that is either owned outright or mutably borrowed from elsewhere.
///
/// [`Crates`] uses it for its `DedupeContext`: owned when built by
/// [`Index::crates`], borrowed when built inside `Index::crates_parallel`.
enum MaybeOwned<'a, T> {
/// The value is stored inline.
Owned(T),
/// The value lives elsewhere and is borrowed mutably.
Borrowed(&'a mut T),
}
/// Iterator over all crates in the index. Skips crates that failed to parse.
pub struct Crates<'a> {
/// Source of the unparsed crate references.
blobs: CratesRefs<'a>,
/// Context passed to [`CrateRef::parse`] for every crate.
dedupe: MaybeOwned<'a, DedupeContext>,
}
impl<'a> Iterator for Crates<'a> {
type Item = Crate;
fn next(&mut self) -> Option<Self::Item> {
for next in self.blobs.by_ref() {
let dedupe = match &mut self.dedupe {
MaybeOwned::Owned(d) => d,
MaybeOwned::Borrowed(d) => d,
};
if let Ok(k) = CrateRef::parse(&next, dedupe) {
return Some(k);
}
}
None
}
}
/// Converts a full url, eg https://github.com/rust-lang/crates.io-index, into
/// the root directory name where cargo itself will fetch it on disk.
///
/// Returns `(directory_name, canonical_url)`, where `directory_name` has the
/// form `<host>-<hash>`. An optional `registry+` prefix is accepted and
/// dropped. Everything from the first `#` (fragment) or first `?` (query)
/// onward is removed before hashing.
///
/// # Errors
///
/// Returns `Error::Url` when `url` has no `://` separator, or when it has a
/// `+`-separated scheme prefix other than `registry`.
fn url_to_local_dir(url: &str) -> Result<(String, String), Error> {
    /// Lower-case hex encoding of `num`, least-significant byte first.
    fn to_hex(num: u64) -> String {
        use std::fmt::Write;

        let mut hex = String::with_capacity(16);
        for byte in num.to_le_bytes().iter() {
            write!(hex, "{:02x}", byte).expect("writing to a String cannot fail");
        }
        hex
    }

    /// Hashes `url` as a registry source with a zero-keyed `SipHasher`.
    // `SipHasher` is deprecated but is what produces the expected directory names.
    #[allow(deprecated)]
    fn hash_u64(url: &str) -> u64 {
        use std::hash::{Hash, Hasher, SipHasher};

        let mut hasher = SipHasher::new_with_keys(0, 0);
        // Registry
        2usize.hash(&mut hasher);
        // Url
        url.hash(&mut hasher);
        hasher.finish()
    }

    // Ensure we have a registry or bare url
    let scheme_end = url
        .find("://")
        .ok_or_else(|| Error::Url(format!("'{}' is not a valid url", url)))?;
    let (url, scheme_end) = match url[..scheme_end].find('+') {
        Some(plus) => {
            if &url[..plus] != "registry" {
                return Err(Error::Url(format!("'{}' is not a valid registry url", url)));
            }
            // Drop the `registry+` prefix and shift the scheme index accordingly.
            (&url[plus + 1..], scheme_end - plus - 1)
        }
        None => (url, scheme_end),
    };

    // Could use the Url crate for this, but it's simple enough and we don't
    // need to deal with every possible url (I hope...)
    let after_scheme = &url[scheme_end + 3..];
    let host = match after_scheme.find('/') {
        Some(end) => &after_scheme[..end],
        None => after_scheme,
    };

    // cargo special cases github.com for reasons, so do the same
    let mut canonical = if host == "github.com" {
        url.to_lowercase()
    } else {
        url.to_owned()
    };

    // Chop off any query params/fragments. The fragment starts at the FIRST '#'
    // and the query at the FIRST '?', so search from the front; searching from
    // the back would leave part of them behind when the delimiter repeats.
    if let Some(fragment_start) = canonical.find('#') {
        canonical.truncate(fragment_start);
    }
    if let Some(query_start) = canonical.find('?') {
        canonical.truncate(query_start);
    }

    // The hash is taken before the trailing slash / `.git` suffix are removed.
    let ident = to_hex(hash_u64(&canonical));

    if canonical.ends_with('/') {
        canonical.pop();
    }

    if canonical.contains("github.com/") && canonical.ends_with(".git") {
        // Only GitHub (crates.io) repositories have their .git suffix truncated
        canonical.truncate(canonical.len() - 4);
    }

    Ok((format!("{}-{}", host, ident), canonical))
}
#[cfg(test)]
mod test {
    use super::Index;

    /// Asserts that version 0.0.1 of `sval` has a `serde_lib` dependency whose
    /// package name differs from its name but equals its crate name.
    fn assert_sval_renamed_dependency(repo: &Index) {
        let krate = repo
            .crate_("sval")
            .expect("Could not find the crate sval in the index");

        let version = krate
            .versions()
            .iter()
            .find(|v| v.version() == "0.0.1")
            .expect("Version 0.0.1 of sval does not exist?");

        let dep_with_package_name = version
            .dependencies()
            .iter()
            .find(|d| d.name() == "serde_lib")
            .expect("sval does not have expected dependency?");

        assert_ne!(
            dep_with_package_name.name(),
            dep_with_package_name.package().unwrap()
        );
        assert_eq!(
            dep_with_package_name.crate_name(),
            dep_with_package_name.package().unwrap()
        );
    }

    /// Sets up the crates.io index in a fresh temporary directory and checks
    /// the `sval` lookup both before and after an `update()`.
    ///
    /// `open_failure_msg` is the panic message used when the index cannot be
    /// set up.
    fn check_sval_before_and_after_update(open_failure_msg: &str) {
        let tmp_dir = tempfile::TempDir::new().unwrap();

        let mut repo = Index::with_path(tmp_dir.path().to_owned(), crate::INDEX_GIT_URL)
            .expect(open_failure_msg);

        assert_sval_renamed_dependency(&repo);

        repo.update().expect("Failed to fetch crates.io index");

        assert_sval_renamed_dependency(&repo);
    }

    #[test]
    fn matches_cargo() {
        assert_eq!(
            super::url_to_local_dir(crate::INDEX_GIT_URL).unwrap(),
            (
                "github.com-1ecc6299db9ec823".to_owned(),
                crate::INDEX_GIT_URL.to_owned()
            )
        );

        // I've confirmed this also works with a custom registry, unfortunately
        // that one includes a secret key as part of the url which would allow
        // anyone to publish to the registry, so uhh...here's a fake one instead
        assert_eq!(
            super::url_to_local_dir(
                "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git"
            )
            .unwrap(),
            (
                "dl.cloudsmith.io-ff79e51ddd2b38fd".to_owned(),
                "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git".to_owned()
            )
        );

        // Ensure we actually strip off the irrelevant parts of a url, note that
        // the .git suffix is not part of the canonical url, but *is* used when hashing
        assert_eq!(
            super::url_to_local_dir(&format!(
                "registry+{}.git?one=1&two=2#fragment",
                crate::INDEX_GIT_URL
            ))
            .unwrap(),
            (
                "github.com-c786010fb7ef2e6e".to_owned(),
                crate::INDEX_GIT_URL.to_owned()
            )
        );
    }

    #[test]
    fn bare_iterator() {
        let tmp_dir = tempfile::TempDir::new().unwrap();

        let repo = Index::with_path(tmp_dir.path().to_owned(), crate::INDEX_GIT_URL)
            .expect("Failed to clone crates.io index");

        assert_eq!("time", repo.crate_("time").unwrap().name());

        // Single pass over the whole index, looking for two well-known crates.
        let mut found_gcc_crate = false;
        let mut found_time_crate = false;

        for c in repo.crates() {
            if c.name() == "gcc" {
                found_gcc_crate = true;
            }
            if c.name() == "time" {
                found_time_crate = true;
            }
        }

        assert!(found_gcc_crate);
        assert!(found_time_crate);
    }

    #[test]
    fn clones_bare_index() {
        check_sval_before_and_after_update("Failed to clone crates.io index");
    }

    #[test]
    fn opens_bare_index() {
        check_sval_before_and_after_update("Failed to open crates.io index");
    }
}